Mercurial > libavcodec.hg
annotate dsputil.c @ 10893:2aafcafbe1f0 libavcodec
Replace cabac checks in inline functions from h264.h with constants.
No benchmark because its just replacing variables with litteral constants
(so no risk for slowdown outside gcc silliness) and i need sleep.
author | michael |
---|---|
date | Sat, 16 Jan 2010 05:41:33 +0000 |
parents | a8620b001ed3 |
children | 563cb9b1a9b7 |
rev | line source |
---|---|
0 | 1 /* |
2 * DSP utils | |
8629
04423b2f6e0b
cosmetics: Remove pointless period after copyright statement non-sentences.
diego
parents:
8627
diff
changeset
|
3 * Copyright (c) 2000, 2001 Fabrice Bellard |
1739
07a484280a82
copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise
michael
parents:
1729
diff
changeset
|
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
0 | 5 * |
5214 | 6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at> |
7 * | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
8 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
9 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
10 * FFmpeg is free software; you can redistribute it and/or |
429 | 11 * modify it under the terms of the GNU Lesser General Public |
12 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
13 * version 2.1 of the License, or (at your option) any later version. |
0 | 14 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
15 * FFmpeg is distributed in the hope that it will be useful, |
0 | 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
18 * Lesser General Public License for more details. | |
0 | 19 * |
429 | 20 * You should have received a copy of the GNU Lesser General Public |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
21 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
3029
diff
changeset
|
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
0 | 23 */ |
2967 | 24 |
1106 | 25 /** |
8718
e9d9d946f213
Use full internal pathname in doxygen @file directives.
diego
parents:
8629
diff
changeset
|
26 * @file libavcodec/dsputil.c |
1106 | 27 * DSP utils |
28 */ | |
2967 | 29 |
0 | 30 #include "avcodec.h" |
31 #include "dsputil.h" | |
1092 | 32 #include "simple_idct.h" |
1557 | 33 #include "faandct.h" |
6407 | 34 #include "faanidct.h" |
8627
d6bab465b82c
moves mid_pred() into mathops.h (with arch specific code split by directory)
aurel
parents:
8596
diff
changeset
|
35 #include "mathops.h" |
3198
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
36 #include "snow.h" |
10748
36611425fedb
Add required header #includes for mpegvideo.h and config.h.
diego
parents:
10644
diff
changeset
|
37 #include "mpegvideo.h" |
36611425fedb
Add required header #includes for mpegvideo.h and config.h.
diego
parents:
10644
diff
changeset
|
38 #include "config.h" |
676 | 39 |
2522
e25782262d7d
kill warnings patch by (Mns Rullgrd <mru inprovide com>)
michael
parents:
2448
diff
changeset
|
40 /* snow.c */ |
e25782262d7d
kill warnings patch by (Mns Rullgrd <mru inprovide com>)
michael
parents:
2448
diff
changeset
|
41 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count); |
e25782262d7d
kill warnings patch by (Mns Rullgrd <mru inprovide com>)
michael
parents:
2448
diff
changeset
|
42 |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
43 /* vorbis.c */ |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
44 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize); |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
45 |
7563 | 46 /* ac3dec.c */ |
47 void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len); | |
48 | |
10424
94595d0e617c
Move autocorrelation function from flacenc.c to lpc.c. Also rename the
jbr
parents:
10421
diff
changeset
|
49 /* lpc.c */ |
94595d0e617c
Move autocorrelation function from flacenc.c to lpc.c. Also rename the
jbr
parents:
10421
diff
changeset
|
50 void ff_lpc_compute_autocorr(const int32_t *data, int len, int lag, double *autoc); |
5737 | 51 |
6384 | 52 /* pngdec.c */ |
53 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp); | |
54 | |
8120 | 55 /* eaidct.c */ |
56 void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block); | |
57 | |
4176 | 58 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, }; |
4179 | 59 uint32_t ff_squareTbl[512] = {0, }; |
0 | 60 |
6387 | 61 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size |
62 #define pb_7f (~0UL/255 * 0x7f) | |
63 #define pb_80 (~0UL/255 * 0x80) | |
6385 | 64 |
/* Classic JPEG/MPEG zigzag scan order: maps scan position to raster index. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
75 | |
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
88 | |
220 | 89 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */ |
10090
1315878bc455
100l, inv_zigzag_direct16 must be aligned to 16 bytes for dct_quantize_SSE2
reimar
parents:
10085
diff
changeset
|
90 DECLARE_ALIGNED_16(uint16_t, inv_zigzag_direct16[64]); |
220 | 91 |
/* Alternate (horizontal) scan pattern for interlaced material. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
102 | |
/* Alternate (vertical) scan pattern for interlaced material. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
113 | |
/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
 * for a>16909558, is an overestimate by less than 1 part in 1<<24 */
const uint32_t ff_inverse[257]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
  16777216
};
151 | |
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
163 | |
/* Row permutation used by the xvid_sse2 IDCT (and similar SSE2 IDCTs). */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
165 |
6438 | 166 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){ |
167 int i; | |
168 int end; | |
169 | |
170 st->scantable= src_scantable; | |
171 | |
172 for(i=0; i<64; i++){ | |
173 int j; | |
174 j = src_scantable[i]; | |
175 st->permutated[i] = permutation[j]; | |
8590 | 176 #if ARCH_PPC |
6438 | 177 st->inverse[j] = i; |
178 #endif | |
179 } | |
180 | |
181 end=-1; | |
182 for(i=0; i<64; i++){ | |
183 int j; | |
184 j = st->permutated[i]; | |
185 if(j>end) end=j; | |
186 st->raster_end[i]= end; | |
187 } | |
188 } | |
189 | |
/* Sum of all 256 samples of a 16x16 block (rows are line_size apart). */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += pix[col];
        pix += line_size;
    }
    return sum;
}
211 | |
1064 | 212 static int pix_norm1_c(uint8_t * pix, int line_size) |
612 | 213 { |
214 int s, i, j; | |
4179 | 215 uint32_t *sq = ff_squareTbl + 256; |
612 | 216 |
217 s = 0; | |
218 for (i = 0; i < 16; i++) { | |
2979 | 219 for (j = 0; j < 16; j += 8) { |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
220 #if 0 |
2979 | 221 s += sq[pix[0]]; |
222 s += sq[pix[1]]; | |
223 s += sq[pix[2]]; | |
224 s += sq[pix[3]]; | |
225 s += sq[pix[4]]; | |
226 s += sq[pix[5]]; | |
227 s += sq[pix[6]]; | |
228 s += sq[pix[7]]; | |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
229 #else |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
230 #if LONG_MAX > 2147483647 |
2979 | 231 register uint64_t x=*(uint64_t*)pix; |
232 s += sq[x&0xff]; | |
233 s += sq[(x>>8)&0xff]; | |
234 s += sq[(x>>16)&0xff]; | |
235 s += sq[(x>>24)&0xff]; | |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
236 s += sq[(x>>32)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
237 s += sq[(x>>40)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
238 s += sq[(x>>48)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
239 s += sq[(x>>56)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
240 #else |
2979 | 241 register uint32_t x=*(uint32_t*)pix; |
242 s += sq[x&0xff]; | |
243 s += sq[(x>>8)&0xff]; | |
244 s += sq[(x>>16)&0xff]; | |
245 s += sq[(x>>24)&0xff]; | |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
246 x=*(uint32_t*)(pix+4); |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
247 s += sq[x&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
248 s += sq[(x>>8)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
249 s += sq[(x>>16)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
250 s += sq[(x>>24)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
251 #endif |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
252 #endif |
2979 | 253 pix += 8; |
254 } | |
255 pix += line_size - 16; | |
612 | 256 } |
257 return s; | |
258 } | |
259 | |
/* Byte-swap w 32-bit words from src into dst (dst may equal src). */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    /* main loop, eight words at a time */
    for (i = 0; i + 8 <= w; i += 8) {
        dst[i    ] = bswap_32(src[i    ]);
        dst[i + 1] = bswap_32(src[i + 1]);
        dst[i + 2] = bswap_32(src[i + 2]);
        dst[i + 3] = bswap_32(src[i + 3]);
        dst[i + 4] = bswap_32(src[i + 4]);
        dst[i + 5] = bswap_32(src[i + 5]);
        dst[i + 6] = bswap_32(src[i + 6]);
        dst[i + 7] = bswap_32(src[i + 7]);
    }
    /* remaining tail words */
    for (; i < w; i++)
        dst[i] = bswap_32(src[i]);
}
612 | 277 |
2184 | 278 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) |
279 { | |
280 int s, i; | |
4179 | 281 uint32_t *sq = ff_squareTbl + 256; |
2184 | 282 |
283 s = 0; | |
284 for (i = 0; i < h; i++) { | |
285 s += sq[pix1[0] - pix2[0]]; | |
286 s += sq[pix1[1] - pix2[1]]; | |
287 s += sq[pix1[2] - pix2[2]]; | |
288 s += sq[pix1[3] - pix2[3]]; | |
289 pix1 += line_size; | |
290 pix2 += line_size; | |
291 } | |
292 return s; | |
293 } | |
294 | |
1708 | 295 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) |
936 | 296 { |
297 int s, i; | |
4179 | 298 uint32_t *sq = ff_squareTbl + 256; |
936 | 299 |
300 s = 0; | |
1708 | 301 for (i = 0; i < h; i++) { |
936 | 302 s += sq[pix1[0] - pix2[0]]; |
303 s += sq[pix1[1] - pix2[1]]; | |
304 s += sq[pix1[2] - pix2[2]]; | |
305 s += sq[pix1[3] - pix2[3]]; | |
306 s += sq[pix1[4] - pix2[4]]; | |
307 s += sq[pix1[5] - pix2[5]]; | |
308 s += sq[pix1[6] - pix2[6]]; | |
309 s += sq[pix1[7] - pix2[7]]; | |
310 pix1 += line_size; | |
311 pix2 += line_size; | |
312 } | |
313 return s; | |
314 } | |
315 | |
1708 | 316 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) |
884 | 317 { |
1012
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
318 int s, i; |
4179 | 319 uint32_t *sq = ff_squareTbl + 256; |
884 | 320 |
321 s = 0; | |
1708 | 322 for (i = 0; i < h; i++) { |
1012
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
323 s += sq[pix1[ 0] - pix2[ 0]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
324 s += sq[pix1[ 1] - pix2[ 1]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
325 s += sq[pix1[ 2] - pix2[ 2]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
326 s += sq[pix1[ 3] - pix2[ 3]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
327 s += sq[pix1[ 4] - pix2[ 4]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
328 s += sq[pix1[ 5] - pix2[ 5]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
329 s += sq[pix1[ 6] - pix2[ 6]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
330 s += sq[pix1[ 7] - pix2[ 7]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
331 s += sq[pix1[ 8] - pix2[ 8]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
332 s += sq[pix1[ 9] - pix2[ 9]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
333 s += sq[pix1[10] - pix2[10]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
334 s += sq[pix1[11] - pix2[11]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
335 s += sq[pix1[12] - pix2[12]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
336 s += sq[pix1[13] - pix2[13]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
337 s += sq[pix1[14] - pix2[14]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
338 s += sq[pix1[15] - pix2[15]]; |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
339 |
1012
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
340 pix1 += line_size; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
341 pix2 += line_size; |
884 | 342 } |
343 return s; | |
344 } | |
345 | |
2184 | 346 |
8590 | 347 #if CONFIG_SNOW_ENCODER //dwt is in snow.c |
2184 | 348 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){ |
349 int s, i, j; | |
350 const int dec_count= w==8 ? 3 : 4; | |
3323
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
351 int tmp[32*32]; |
2184 | 352 int level, ori; |
2967 | 353 static const int scale[2][2][4][4]={ |
2184 | 354 { |
355 { | |
3323
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
356 // 9/7 8x8 dec=3 |
2184 | 357 {268, 239, 239, 213}, |
358 { 0, 224, 224, 152}, | |
359 { 0, 135, 135, 110}, | |
360 },{ | |
3323
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
361 // 9/7 16x16 or 32x32 dec=4 |
2184 | 362 {344, 310, 310, 280}, |
363 { 0, 320, 320, 228}, | |
364 { 0, 175, 175, 136}, | |
365 { 0, 129, 129, 102}, | |
366 } | |
367 },{ | |
3323
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
368 { |
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
369 // 5/3 8x8 dec=3 |
2184 | 370 {275, 245, 245, 218}, |
371 { 0, 230, 230, 156}, | |
372 { 0, 138, 138, 113}, | |
373 },{ | |
3323
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
374 // 5/3 16x16 or 32x32 dec=4 |
2184 | 375 {352, 317, 317, 286}, |
376 { 0, 328, 328, 233}, | |
377 { 0, 180, 180, 140}, | |
378 { 0, 132, 132, 105}, | |
379 } | |
380 } | |
381 }; | |
382 | |
383 for (i = 0; i < h; i++) { | |
384 for (j = 0; j < w; j+=4) { | |
3323
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
385 tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4; |
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
386 tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4; |
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
387 tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4; |
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
388 tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4; |
2184 | 389 } |
390 pix1 += line_size; | |
391 pix2 += line_size; | |
392 } | |
2639 | 393 |
3323
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
394 ff_spatial_dwt(tmp, w, h, 32, type, dec_count); |
2184 | 395 |
396 s=0; | |
3323
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
397 assert(w==h); |
2184 | 398 for(level=0; level<dec_count; level++){ |
399 for(ori= level ? 1 : 0; ori<4; ori++){ | |
3323
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
400 int size= w>>(dec_count-level); |
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
401 int sx= (ori&1) ? size : 0; |
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
402 int stride= 32<<(dec_count-level); |
2184 | 403 int sy= (ori&2) ? stride>>1 : 0; |
2967 | 404 |
2184 | 405 for(i=0; i<size; i++){ |
406 for(j=0; j<size; j++){ | |
407 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori]; | |
4001 | 408 s += FFABS(v); |
2184 | 409 } |
410 } | |
411 } | |
412 } | |
2967 | 413 assert(s>=0); |
3323
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
414 return s>>9; |
2184 | 415 } |
416 | |
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    /* 8-wide 5/3 wavelet compare */
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}
420 | |
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    /* 8-wide 9/7 wavelet compare */
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}
424 | |
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    /* 16-wide 5/3 wavelet compare */
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
428 | |
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    /* 16-wide 9/7 wavelet compare */
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
432 | |
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    /* 32-wide 5/3 wavelet compare (non-static: used by motion estimation) */
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}
4197 | 437 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){ |
3323
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
438 return w_c(v, pix1, pix2, line_size, 32, h, 0); |
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
439 } |
3373
b8996cc5ccae
Disable w53 and w97 cmp methods when snow encoder is disabled
gpoirier
parents:
3323
diff
changeset
|
440 #endif |
3323
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
441 |
/* Replicate the border samples of a width x height image into a band of
 * 'w' extra pixels on every side (rows are 'wrap' bytes apart).
 * FIXME check that this is ok for mpeg4 interlaced */
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *last_line = buf + (height - 1) * wrap;
    uint8_t *p;
    int i;

    /* top and bottom bands: copy the first/last row outwards */
    for (i = 1; i <= w; i++) {
        memcpy(buf       - i * wrap, buf,       width);
        memcpy(last_line + i * wrap, last_line, width);
    }

    /* left and right bands: smear the first/last column */
    p = buf;
    for (i = 0; i < height; i++) {
        memset(p - w,     p[0],         w);
        memset(p + width, p[width - 1], w);
        p += wrap;
    }

    /* the four corner squares */
    for (i = 1; i <= w; i++) {
        memset(buf       - i * wrap - w,     buf[0],               w); /* top left */
        memset(buf       - i * wrap + width, buf[width - 1],       w); /* top right */
        memset(last_line + i * wrap - w,     last_line[0],         w); /* bottom left */
        memset(last_line + i * wrap + width, last_line[width - 1], w); /* bottom right */
    }
}
470 | |
/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer, already pointing at the top-left sample of the block
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int y;
    int start_y, start_x, end_y, end_x;

    /* clamp blocks that lie entirely outside the image */
    if (src_y >= h) {
        src  += (h - 1 - src_y) * linesize;
        src_y = h - 1;
    } else if (src_y <= -block_h) {
        src  += (1 - block_h - src_y) * linesize;
        src_y = 1 - block_h;
    }
    if (src_x >= w) {
        src  += (w - 1 - src_x);
        src_x = w - 1;
    } else if (src_x <= -block_w) {
        src  += (1 - block_w - src_x);
        src_x = 1 - block_w;
    }

    /* region of the block actually covered by the image */
    start_y = src_y < 0 ? -src_y : 0;
    start_x = src_x < 0 ? -src_x : 0;
    end_y   = h - src_y < block_h ? h - src_y : block_h;
    end_x   = w - src_x < block_w ? w - src_x : block_w;

    /* copy the existing part */
    for (y = start_y; y < end_y; y++)
        memcpy(buf + y*linesize + start_x, src + y*linesize + start_x, end_x - start_x);

    /* top: replicate the first valid row upwards */
    for (y = 0; y < start_y; y++)
        memcpy(buf + y*linesize + start_x, buf + start_y*linesize + start_x, end_x - start_x);

    /* bottom: replicate the last valid row downwards */
    for (y = end_y; y < block_h; y++)
        memcpy(buf + y*linesize + start_x, buf + (end_y - 1)*linesize + start_x, end_x - start_x);

    /* left and right: smear the first/last valid column of every row */
    for (y = 0; y < block_h; y++) {
        memset(buf + y*linesize,         buf[y*linesize + start_x],   start_x);
        memset(buf + y*linesize + end_x, buf[y*linesize + end_x - 1], block_w - end_x);
    }
}
541 | |
1064 | 542 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size) |
0 | 543 { |
544 int i; | |
545 | |
546 /* read the pixels */ | |
547 for(i=0;i<8;i++) { | |
516 | 548 block[0] = pixels[0]; |
549 block[1] = pixels[1]; | |
550 block[2] = pixels[2]; | |
551 block[3] = pixels[3]; | |
552 block[4] = pixels[4]; | |
553 block[5] = pixels[5]; | |
554 block[6] = pixels[6]; | |
555 block[7] = pixels[7]; | |
556 pixels += line_size; | |
557 block += 8; | |
0 | 558 } |
559 } | |
560 | |
1064 | 561 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1, |
2979 | 562 const uint8_t *s2, int stride){ |
324 | 563 int i; |
564 | |
565 /* read the pixels */ | |
566 for(i=0;i<8;i++) { | |
516 | 567 block[0] = s1[0] - s2[0]; |
568 block[1] = s1[1] - s2[1]; | |
569 block[2] = s1[2] - s2[2]; | |
570 block[3] = s1[3] - s2[3]; | |
571 block[4] = s1[4] - s2[4]; | |
572 block[5] = s1[5] - s2[5]; | |
573 block[6] = s1[6] - s2[6]; | |
574 block[7] = s1[7] - s2[7]; | |
324 | 575 s1 += stride; |
576 s2 += stride; | |
516 | 577 block += 8; |
324 | 578 } |
579 } | |
580 | |
581 | |
1064 | 582 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels, |
2979 | 583 int line_size) |
0 | 584 { |
585 int i; | |
4176 | 586 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2967 | 587 |
0 | 588 /* read the pixels */ |
589 for(i=0;i<8;i++) { | |
516 | 590 pixels[0] = cm[block[0]]; |
591 pixels[1] = cm[block[1]]; | |
592 pixels[2] = cm[block[2]]; | |
593 pixels[3] = cm[block[3]]; | |
594 pixels[4] = cm[block[4]]; | |
595 pixels[5] = cm[block[5]]; | |
596 pixels[6] = cm[block[6]]; | |
597 pixels[7] = cm[block[7]]; | |
598 | |
599 pixels += line_size; | |
600 block += 8; | |
0 | 601 } |
602 } | |
603 | |
2256 | 604 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels, |
2979 | 605 int line_size) |
2256 | 606 { |
607 int i; | |
4176 | 608 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2967 | 609 |
2256 | 610 /* read the pixels */ |
611 for(i=0;i<4;i++) { | |
612 pixels[0] = cm[block[0]]; | |
613 pixels[1] = cm[block[1]]; | |
614 pixels[2] = cm[block[2]]; | |
615 pixels[3] = cm[block[3]]; | |
616 | |
617 pixels += line_size; | |
618 block += 8; | |
619 } | |
620 } | |
621 | |
2257 | 622 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels, |
2979 | 623 int line_size) |
2257 | 624 { |
625 int i; | |
4176 | 626 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2967 | 627 |
2257 | 628 /* read the pixels */ |
629 for(i=0;i<2;i++) { | |
630 pixels[0] = cm[block[0]]; | |
631 pixels[1] = cm[block[1]]; | |
632 | |
633 pixels += line_size; | |
634 block += 8; | |
635 } | |
636 } | |
637 | |
2967 | 638 static void put_signed_pixels_clamped_c(const DCTELEM *block, |
1984
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
639 uint8_t *restrict pixels, |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
640 int line_size) |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
641 { |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
642 int i, j; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
643 |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
644 for (i = 0; i < 8; i++) { |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
645 for (j = 0; j < 8; j++) { |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
646 if (*block < -128) |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
647 *pixels = 0; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
648 else if (*block > 127) |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
649 *pixels = 255; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
650 else |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
651 *pixels = (uint8_t)(*block + 128); |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
652 block++; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
653 pixels++; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
654 } |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
655 pixels += (line_size - 8); |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
656 } |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
657 } |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
658 |
1064 | 659 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels, |
516 | 660 int line_size) |
0 | 661 { |
662 int i; | |
4176 | 663 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2967 | 664 |
0 | 665 /* read the pixels */ |
666 for(i=0;i<8;i++) { | |
516 | 667 pixels[0] = cm[pixels[0] + block[0]]; |
668 pixels[1] = cm[pixels[1] + block[1]]; | |
669 pixels[2] = cm[pixels[2] + block[2]]; | |
670 pixels[3] = cm[pixels[3] + block[3]]; | |
671 pixels[4] = cm[pixels[4] + block[4]]; | |
672 pixels[5] = cm[pixels[5] + block[5]]; | |
673 pixels[6] = cm[pixels[6] + block[6]]; | |
674 pixels[7] = cm[pixels[7] + block[7]]; | |
675 pixels += line_size; | |
676 block += 8; | |
0 | 677 } |
678 } | |
2256 | 679 |
680 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels, | |
681 int line_size) | |
682 { | |
683 int i; | |
4176 | 684 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2967 | 685 |
2256 | 686 /* read the pixels */ |
687 for(i=0;i<4;i++) { | |
688 pixels[0] = cm[pixels[0] + block[0]]; | |
689 pixels[1] = cm[pixels[1] + block[1]]; | |
690 pixels[2] = cm[pixels[2] + block[2]]; | |
691 pixels[3] = cm[pixels[3] + block[3]]; | |
692 pixels += line_size; | |
693 block += 8; | |
694 } | |
695 } | |
2257 | 696 |
697 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels, | |
698 int line_size) | |
699 { | |
700 int i; | |
4176 | 701 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2967 | 702 |
2257 | 703 /* read the pixels */ |
704 for(i=0;i<2;i++) { | |
705 pixels[0] = cm[pixels[0] + block[0]]; | |
706 pixels[1] = cm[pixels[1] + block[1]]; | |
707 pixels += line_size; | |
708 block += 8; | |
709 } | |
710 } | |
2763 | 711 |
712 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size) | |
713 { | |
714 int i; | |
715 for(i=0;i<8;i++) { | |
716 pixels[0] += block[0]; | |
717 pixels[1] += block[1]; | |
718 pixels[2] += block[2]; | |
719 pixels[3] += block[3]; | |
720 pixels[4] += block[4]; | |
721 pixels[5] += block[5]; | |
722 pixels[6] += block[6]; | |
723 pixels[7] += block[7]; | |
724 pixels += line_size; | |
725 block += 8; | |
726 } | |
727 } | |
728 | |
729 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size) | |
730 { | |
731 int i; | |
732 for(i=0;i<4;i++) { | |
733 pixels[0] += block[0]; | |
734 pixels[1] += block[1]; | |
735 pixels[2] += block[2]; | |
736 pixels[3] += block[3]; | |
737 pixels += line_size; | |
738 block += 4; | |
739 } | |
740 } | |
741 | |
4988
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
742 static int sum_abs_dctelem_c(DCTELEM *block) |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
743 { |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
744 int sum=0, i; |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
745 for(i=0; i<64; i++) |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
746 sum+= FFABS(block[i]); |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
747 return sum; |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
748 } |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
749 |
385 | 750 #if 0 |
751 | |
752 #define PIXOP2(OPNAME, OP) \ | |
651 | 753 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 754 {\ |
755 int i;\ | |
756 for(i=0; i<h; i++){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
757 OP(*((uint64_t*)block), AV_RN64(pixels));\ |
385 | 758 pixels+=line_size;\ |
759 block +=line_size;\ | |
760 }\ | |
761 }\ | |
762 \ | |
859 | 763 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 764 {\ |
765 int i;\ | |
766 for(i=0; i<h; i++){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
767 const uint64_t a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
768 const uint64_t b= AV_RN64(pixels+1);\ |
385 | 769 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ |
770 pixels+=line_size;\ | |
771 block +=line_size;\ | |
772 }\ | |
773 }\ | |
774 \ | |
859 | 775 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 776 {\ |
777 int i;\ | |
778 for(i=0; i<h; i++){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
779 const uint64_t a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
780 const uint64_t b= AV_RN64(pixels+1);\ |
385 | 781 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ |
782 pixels+=line_size;\ | |
783 block +=line_size;\ | |
784 }\ | |
785 }\ | |
786 \ | |
859 | 787 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 788 {\ |
789 int i;\ | |
790 for(i=0; i<h; i++){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
791 const uint64_t a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
792 const uint64_t b= AV_RN64(pixels+line_size);\ |
385 | 793 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ |
794 pixels+=line_size;\ | |
795 block +=line_size;\ | |
796 }\ | |
797 }\ | |
798 \ | |
859 | 799 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 800 {\ |
801 int i;\ | |
802 for(i=0; i<h; i++){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
803 const uint64_t a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
804 const uint64_t b= AV_RN64(pixels+line_size);\ |
385 | 805 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ |
806 pixels+=line_size;\ | |
807 block +=line_size;\ | |
808 }\ | |
809 }\ | |
810 \ | |
859 | 811 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 812 {\ |
813 int i;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
814 const uint64_t a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
815 const uint64_t b= AV_RN64(pixels+1);\ |
385 | 816 uint64_t l0= (a&0x0303030303030303ULL)\ |
817 + (b&0x0303030303030303ULL)\ | |
818 + 0x0202020202020202ULL;\ | |
819 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
820 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
821 uint64_t l1,h1;\ | |
822 \ | |
823 pixels+=line_size;\ | |
824 for(i=0; i<h; i+=2){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
825 uint64_t a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
826 uint64_t b= AV_RN64(pixels+1);\ |
385 | 827 l1= (a&0x0303030303030303ULL)\ |
828 + (b&0x0303030303030303ULL);\ | |
829 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
830 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
831 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ | |
832 pixels+=line_size;\ | |
833 block +=line_size;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
834 a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
835 b= AV_RN64(pixels+1);\ |
385 | 836 l0= (a&0x0303030303030303ULL)\ |
837 + (b&0x0303030303030303ULL)\ | |
838 + 0x0202020202020202ULL;\ | |
839 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
840 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
841 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ | |
842 pixels+=line_size;\ | |
843 block +=line_size;\ | |
844 }\ | |
845 }\ | |
846 \ | |
859 | 847 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 848 {\ |
849 int i;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
850 const uint64_t a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
851 const uint64_t b= AV_RN64(pixels+1);\ |
385 | 852 uint64_t l0= (a&0x0303030303030303ULL)\ |
853 + (b&0x0303030303030303ULL)\ | |
854 + 0x0101010101010101ULL;\ | |
855 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
856 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
857 uint64_t l1,h1;\ | |
858 \ | |
859 pixels+=line_size;\ | |
860 for(i=0; i<h; i+=2){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
861 uint64_t a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
862 uint64_t b= AV_RN64(pixels+1);\ |
385 | 863 l1= (a&0x0303030303030303ULL)\ |
864 + (b&0x0303030303030303ULL);\ | |
865 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
866 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
867 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ | |
868 pixels+=line_size;\ | |
869 block +=line_size;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
870 a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
871 b= AV_RN64(pixels+1);\ |
385 | 872 l0= (a&0x0303030303030303ULL)\ |
873 + (b&0x0303030303030303ULL)\ | |
874 + 0x0101010101010101ULL;\ | |
875 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
876 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
877 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ | |
878 pixels+=line_size;\ | |
879 block +=line_size;\ | |
880 }\ | |
881 }\ | |
882 \ | |
859 | 883 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\ |
884 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\ | |
885 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\ | |
886 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\ | |
887 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\ | |
888 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\ | |
889 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8) | |
385 | 890 |
891 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) ) | |
892 #else // 64 bit variant | |
893 | |
894 #define PIXOP2(OPNAME, OP) \ | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
895 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
896 int i;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
897 for(i=0; i<h; i++){\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
898 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\ |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
899 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
900 block +=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
901 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
902 }\ |
1168 | 903 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
904 int i;\ | |
905 for(i=0; i<h; i++){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
906 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\ |
1168 | 907 pixels+=line_size;\ |
908 block +=line_size;\ | |
909 }\ | |
910 }\ | |
859 | 911 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
385 | 912 int i;\ |
913 for(i=0; i<h; i++){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
914 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
915 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\ |
385 | 916 pixels+=line_size;\ |
917 block +=line_size;\ | |
918 }\ | |
919 }\ | |
859 | 920 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
921 OPNAME ## _pixels8_c(block, pixels, line_size, h);\ | |
651 | 922 }\ |
385 | 923 \ |
651 | 924 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
925 int src_stride1, int src_stride2, int h){\ | |
385 | 926 int i;\ |
927 for(i=0; i<h; i++){\ | |
651 | 928 uint32_t a,b;\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
929 a= AV_RN32(&src1[i*src_stride1 ]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
930 b= AV_RN32(&src2[i*src_stride2 ]);\ |
1264 | 931 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
932 a= AV_RN32(&src1[i*src_stride1+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
933 b= AV_RN32(&src2[i*src_stride2+4]);\ |
1264 | 934 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\ |
385 | 935 }\ |
936 }\ | |
937 \ | |
651 | 938 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
939 int src_stride1, int src_stride2, int h){\ | |
385 | 940 int i;\ |
941 for(i=0; i<h; i++){\ | |
651 | 942 uint32_t a,b;\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
943 a= AV_RN32(&src1[i*src_stride1 ]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
944 b= AV_RN32(&src2[i*src_stride2 ]);\ |
1264 | 945 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
946 a= AV_RN32(&src1[i*src_stride1+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
947 b= AV_RN32(&src2[i*src_stride2+4]);\ |
1264 | 948 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\ |
385 | 949 }\ |
950 }\ | |
951 \ | |
1168 | 952 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
953 int src_stride1, int src_stride2, int h){\ | |
954 int i;\ | |
955 for(i=0; i<h; i++){\ | |
956 uint32_t a,b;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
957 a= AV_RN32(&src1[i*src_stride1 ]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
958 b= AV_RN32(&src2[i*src_stride2 ]);\ |
1264 | 959 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ |
1168 | 960 }\ |
961 }\ | |
962 \ | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
963 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
964 int src_stride1, int src_stride2, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
965 int i;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
966 for(i=0; i<h; i++){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
967 uint32_t a,b;\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
968 a= AV_RN16(&src1[i*src_stride1 ]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
969 b= AV_RN16(&src2[i*src_stride2 ]);\ |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
970 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
971 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
972 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
973 \ |
651 | 974 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
975 int src_stride1, int src_stride2, int h){\ | |
976 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\ | |
977 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\ | |
978 }\ | |
979 \ | |
980 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ | |
981 int src_stride1, int src_stride2, int h){\ | |
982 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\ | |
983 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\ | |
984 }\ | |
985 \ | |
859 | 986 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
651 | 987 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ |
988 }\ | |
989 \ | |
859 | 990 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
651 | 991 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ |
992 }\ | |
993 \ | |
859 | 994 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
651 | 995 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ |
996 }\ | |
997 \ | |
859 | 998 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
651 | 999 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ |
385 | 1000 }\ |
1001 \ | |
651 | 1002 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ |
1003 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | |
1004 int i;\ | |
1005 for(i=0; i<h; i++){\ | |
1006 uint32_t a, b, c, d, l0, l1, h0, h1;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1007 a= AV_RN32(&src1[i*src_stride1]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1008 b= AV_RN32(&src2[i*src_stride2]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1009 c= AV_RN32(&src3[i*src_stride3]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1010 d= AV_RN32(&src4[i*src_stride4]);\ |
651 | 1011 l0= (a&0x03030303UL)\ |
1012 + (b&0x03030303UL)\ | |
1013 + 0x02020202UL;\ | |
1014 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1015 + ((b&0xFCFCFCFCUL)>>2);\ | |
1016 l1= (c&0x03030303UL)\ | |
1017 + (d&0x03030303UL);\ | |
1018 h1= ((c&0xFCFCFCFCUL)>>2)\ | |
1019 + ((d&0xFCFCFCFCUL)>>2);\ | |
1020 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1021 a= AV_RN32(&src1[i*src_stride1+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1022 b= AV_RN32(&src2[i*src_stride2+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1023 c= AV_RN32(&src3[i*src_stride3+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1024 d= AV_RN32(&src4[i*src_stride4+4]);\ |
651 | 1025 l0= (a&0x03030303UL)\ |
1026 + (b&0x03030303UL)\ | |
1027 + 0x02020202UL;\ | |
1028 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1029 + ((b&0xFCFCFCFCUL)>>2);\ | |
1030 l1= (c&0x03030303UL)\ | |
1031 + (d&0x03030303UL);\ | |
1032 h1= ((c&0xFCFCFCFCUL)>>2)\ | |
1033 + ((d&0xFCFCFCFCUL)>>2);\ | |
1034 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
1035 }\ | |
1036 }\ | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1037 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1038 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1039 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1040 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1041 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1042 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1043 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1044 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1045 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1046 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1047 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1048 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1049 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1050 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1051 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1052 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1053 \ |
651 | 1054 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ |
1055 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | |
385 | 1056 int i;\ |
1057 for(i=0; i<h; i++){\ | |
651 | 1058 uint32_t a, b, c, d, l0, l1, h0, h1;\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1059 a= AV_RN32(&src1[i*src_stride1]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1060 b= AV_RN32(&src2[i*src_stride2]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1061 c= AV_RN32(&src3[i*src_stride3]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1062 d= AV_RN32(&src4[i*src_stride4]);\ |
651 | 1063 l0= (a&0x03030303UL)\ |
1064 + (b&0x03030303UL)\ | |
1065 + 0x01010101UL;\ | |
1066 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1067 + ((b&0xFCFCFCFCUL)>>2);\ | |
1068 l1= (c&0x03030303UL)\ | |
1069 + (d&0x03030303UL);\ | |
1070 h1= ((c&0xFCFCFCFCUL)>>2)\ | |
1071 + ((d&0xFCFCFCFCUL)>>2);\ | |
1072 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1073 a= AV_RN32(&src1[i*src_stride1+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1074 b= AV_RN32(&src2[i*src_stride2+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1075 c= AV_RN32(&src3[i*src_stride3+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1076 d= AV_RN32(&src4[i*src_stride4+4]);\ |
651 | 1077 l0= (a&0x03030303UL)\ |
1078 + (b&0x03030303UL)\ | |
1079 + 0x01010101UL;\ | |
1080 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1081 + ((b&0xFCFCFCFCUL)>>2);\ | |
1082 l1= (c&0x03030303UL)\ | |
1083 + (d&0x03030303UL);\ | |
1084 h1= ((c&0xFCFCFCFCUL)>>2)\ | |
1085 + ((d&0xFCFCFCFCUL)>>2);\ | |
1086 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
385 | 1087 }\ |
1088 }\ | |
651 | 1089 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ |
1090 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | |
1091 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ | |
1092 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ | |
1093 }\ | |
1094 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ | |
1095 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | |
1096 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ | |
1097 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ | |
1098 }\ | |
385 | 1099 \ |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1100 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1101 {\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1102 int i, a0, b0, a1, b1;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1103 a0= pixels[0];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1104 b0= pixels[1] + 2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1105 a0 += b0;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1106 b0 += pixels[2];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1107 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1108 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1109 for(i=0; i<h; i+=2){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1110 a1= pixels[0];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1111 b1= pixels[1];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1112 a1 += b1;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1113 b1 += pixels[2];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1114 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1115 block[0]= (a1+a0)>>2; /* FIXME non put */\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1116 block[1]= (b1+b0)>>2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1117 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1118 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1119 block +=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1120 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1121 a0= pixels[0];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1122 b0= pixels[1] + 2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1123 a0 += b0;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1124 b0 += pixels[2];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1125 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1126 block[0]= (a1+a0)>>2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1127 block[1]= (b1+b0)>>2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1128 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1129 block +=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1130 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1131 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1132 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1133 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1134 {\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1135 int i;\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1136 const uint32_t a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1137 const uint32_t b= AV_RN32(pixels+1);\ |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1138 uint32_t l0= (a&0x03030303UL)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1139 + (b&0x03030303UL)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1140 + 0x02020202UL;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1141 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1142 + ((b&0xFCFCFCFCUL)>>2);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1143 uint32_t l1,h1;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1144 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1145 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1146 for(i=0; i<h; i+=2){\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1147 uint32_t a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1148 uint32_t b= AV_RN32(pixels+1);\ |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1149 l1= (a&0x03030303UL)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1150 + (b&0x03030303UL);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1151 h1= ((a&0xFCFCFCFCUL)>>2)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1152 + ((b&0xFCFCFCFCUL)>>2);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1153 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1154 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1155 block +=line_size;\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1156 a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1157 b= AV_RN32(pixels+1);\ |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1158 l0= (a&0x03030303UL)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1159 + (b&0x03030303UL)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1160 + 0x02020202UL;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1161 h0= ((a&0xFCFCFCFCUL)>>2)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1162 + ((b&0xFCFCFCFCUL)>>2);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1163 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1164 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1165 block +=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1166 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1167 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1168 \ |
859 | 1169 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 1170 {\ |
1171 int j;\ | |
1172 for(j=0; j<2; j++){\ | |
1173 int i;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1174 const uint32_t a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1175 const uint32_t b= AV_RN32(pixels+1);\ |
385 | 1176 uint32_t l0= (a&0x03030303UL)\ |
1177 + (b&0x03030303UL)\ | |
1178 + 0x02020202UL;\ | |
1179 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1180 + ((b&0xFCFCFCFCUL)>>2);\ | |
1181 uint32_t l1,h1;\ | |
1182 \ | |
1183 pixels+=line_size;\ | |
1184 for(i=0; i<h; i+=2){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1185 uint32_t a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1186 uint32_t b= AV_RN32(pixels+1);\ |
385 | 1187 l1= (a&0x03030303UL)\ |
1188 + (b&0x03030303UL);\ | |
1189 h1= ((a&0xFCFCFCFCUL)>>2)\ | |
1190 + ((b&0xFCFCFCFCUL)>>2);\ | |
1191 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
1192 pixels+=line_size;\ | |
1193 block +=line_size;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1194 a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1195 b= AV_RN32(pixels+1);\ |
385 | 1196 l0= (a&0x03030303UL)\ |
1197 + (b&0x03030303UL)\ | |
1198 + 0x02020202UL;\ | |
1199 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1200 + ((b&0xFCFCFCFCUL)>>2);\ | |
1201 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
1202 pixels+=line_size;\ | |
1203 block +=line_size;\ | |
1204 }\ | |
1205 pixels+=4-line_size*(h+1);\ | |
1206 block +=4-line_size*h;\ | |
1207 }\ | |
1208 }\ | |
1209 \ | |
859 | 1210 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 1211 {\ |
1212 int j;\ | |
1213 for(j=0; j<2; j++){\ | |
1214 int i;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1215 const uint32_t a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1216 const uint32_t b= AV_RN32(pixels+1);\ |
385 | 1217 uint32_t l0= (a&0x03030303UL)\ |
1218 + (b&0x03030303UL)\ | |
1219 + 0x01010101UL;\ | |
1220 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1221 + ((b&0xFCFCFCFCUL)>>2);\ | |
1222 uint32_t l1,h1;\ | |
1223 \ | |
1224 pixels+=line_size;\ | |
1225 for(i=0; i<h; i+=2){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1226 uint32_t a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1227 uint32_t b= AV_RN32(pixels+1);\ |
385 | 1228 l1= (a&0x03030303UL)\ |
1229 + (b&0x03030303UL);\ | |
1230 h1= ((a&0xFCFCFCFCUL)>>2)\ | |
1231 + ((b&0xFCFCFCFCUL)>>2);\ | |
1232 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
1233 pixels+=line_size;\ | |
1234 block +=line_size;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1235 a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1236 b= AV_RN32(pixels+1);\ |
385 | 1237 l0= (a&0x03030303UL)\ |
1238 + (b&0x03030303UL)\ | |
1239 + 0x01010101UL;\ | |
1240 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1241 + ((b&0xFCFCFCFCUL)>>2);\ | |
1242 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
1243 pixels+=line_size;\ | |
1244 block +=line_size;\ | |
1245 }\ | |
1246 pixels+=4-line_size*(h+1);\ | |
1247 block +=4-line_size*h;\ | |
1248 }\ | |
1249 }\ | |
1250 \ | |
859 | 1251 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\ |
1252 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\ | |
1253 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\ | |
1254 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\ | |
1255 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\ | |
1256 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\ | |
1257 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\ | |
1258 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\ | |
651 | 1259 |
1264 | 1260 #define op_avg(a, b) a = rnd_avg32(a, b) |
385 | 1261 #endif |
1262 #define op_put(a, b) a = b | |
1263 | |
1264 PIXOP2(avg, op_avg) | |
1265 PIXOP2(put, op_put) | |
1266 #undef op_avg | |
1267 #undef op_put | |
1268 | |
0 | 1269 #define avg2(a,b) ((a+b+1)>>1) |
1270 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2) | |
1271 | |
/* 16-pixel-wide two-source average without rounding; convenience wrapper
 * around put_no_rnd_pixels16_l2() using one common stride for dst and
 * both sources. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1275 | |
/* 8-pixel-wide two-source average without rounding; convenience wrapper
 * around put_no_rnd_pixels8_l2() using one common stride for dst and
 * both sources. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
753 | 1279 |
/* One-warp-point global motion compensation (MPEG-4 GMC1), C reference.
 * Produces an 8 x h block: each output pixel is the bilinear blend of the
 * four source neighbours with 1/16-pel weights x16/y16 (each in 0..15);
 * the four weights A..D sum to 256, so the blend is normalised by >>8.
 * 'rounder' is the rounding term added before the shift.
 * Reads 9 columns and h+1 rows of src. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int y, x;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (A * src[x         ] + B * src[x + 1         ] +
                      C * src[x + stride] + D * src[x + stride + 1] +
                      rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
1302 | |
3248
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3245
diff
changeset
|
/**
 * Global motion compensation, C reference.
 * Fills an 8 x h destination block by sampling src along the affine motion
 * field (vx,vy) = (ox,oy) + x*(dxx,dyx) + y*(dxy,dyy). The vectors are in
 * fixed point with (16+shift) fractional bits: bits [16, 16+shift) hold the
 * sub-pel fraction used for bilinear weighting, and r is the rounding term.
 * width/height give the readable source extent; samples landing outside it
 * are clamped to the nearest edge row/column (interpolating only along the
 * in-range axis, or copying the corner pixel when both axes are out).
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s = 1 << shift;
    int x, y;

    /* from here on, width/height are the largest valid coordinates */
    width--;
    height--;

    for (y = 0; y < h; y++) {
        int vx = ox;
        int vy = oy;

        for (x = 0; x < 8; x++) { //XXX FIXME optimize
            /* integer sample position and sub-pel fraction */
            int sx = vx >> 16;
            int sy = vy >> 16;
            const int fx = sx & (s - 1);
            const int fy = sy & (s - 1);
            sx >>= shift;
            sy >>= shift;

            if ((unsigned)sx < width) {
                if ((unsigned)sy < height) {
                    /* fully inside the source: bilinear interpolation */
                    const int idx = sx + sy * stride;
                    dst[y * stride + x] = ((  src[idx             ] * (s - fx)
                                            + src[idx + 1         ] *      fx ) * (s - fy)
                                          + (  src[idx + stride    ] * (s - fx)
                                             + src[idx + stride + 1] *      fx ) *      fy
                                          + r) >> (shift * 2);
                } else {
                    /* y out of range: clamp the row, interpolate horizontally */
                    const int cy  = sy < 0 ? 0 : sy > height ? height : sy;
                    const int idx = sx + cy * stride;
                    dst[y * stride + x] = ((  src[idx    ] * (s - fx)
                                            + src[idx + 1] *      fx ) * s
                                          + r) >> (shift * 2);
                }
            } else {
                if ((unsigned)sy < height) {
                    /* x out of range: clamp the column, interpolate vertically */
                    const int cx  = sx < 0 ? 0 : sx > width ? width : sx;
                    const int idx = cx + sy * stride;
                    dst[y * stride + x] = ((  src[idx         ] * (s - fy)
                                            + src[idx + stride] *      fy ) * s
                                          + r) >> (shift * 2);
                } else {
                    /* both out of range: nearest edge pixel, no interpolation */
                    const int cx = sx < 0 ? 0 : sx > width  ? width  : sx;
                    const int cy = sy < 0 ? 0 : sy > height ? height : sy;
                    dst[y * stride + x] = src[cx + cy * stride];
                }
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1360 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/* Copy a width x height block from src to dst (width in {2,4,8,16}),
 * dispatching to the fixed-width put_pixelsN_c helpers. Any other width
 * writes nothing, matching the original switch without a default case. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1369 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/* Thirdpel put, horizontal 1/3 phase (used for SVQ3-style MC — "tpel"
 * presumably means thirdpel given the svq3 context; confirm upstream):
 * dst = approx (2*a + b)/3, computed in fixed point as 683/2048
 * (683*3 = 2049). Reads width+1 columns per row. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (2 * src[col] + src[col + 1] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1380 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/* Thirdpel put, horizontal 2/3 phase (used for SVQ3-style MC — "tpel"
 * presumably means thirdpel given the svq3 context; confirm upstream):
 * dst = approx (a + 2*b)/3, computed in fixed point as 683/2048.
 * Reads width+1 columns per row. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (src[col] + 2 * src[col + 1] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}
2967 | 1391 |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1392 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1393 int i,j; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1394 for (i=0; i < height; i++) { |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1395 for (j=0; j < width; j++) { |
2979 | 1396 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11; |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1397 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1398 src += stride; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1399 dst += stride; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1400 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1401 } |
2967 | 1402 |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1403 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1404 int i,j; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1405 for (i=0; i < height; i++) { |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1406 for (j=0; j < width; j++) { |
2979 | 1407 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15; |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1408 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1409 src += stride; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1410 dst += stride; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1411 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1412 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1413 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1414 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1415 int i,j; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1416 for (i=0; i < height; i++) { |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1417 for (j=0; j < width; j++) { |
2979 | 1418 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15; |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1419 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1420 src += stride; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1421 dst += stride; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1422 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1423 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1424 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1425 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1426 int i,j; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1427 for (i=0; i < height; i++) { |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1428 for (j=0; j < width; j++) { |
2979 | 1429 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11; |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1430 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1431 src += stride; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1432 dst += stride; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1433 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1434 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1435 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1436 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1437 int i,j; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1438 for (i=0; i < height; i++) { |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1439 for (j=0; j < width; j++) { |
2979 | 1440 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15; |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1441 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1442 src += stride; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1443 dst += stride; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1444 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1445 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1446 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1447 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1448 int i,j; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1449 for (i=0; i < height; i++) { |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1450 for (j=0; j < width; j++) { |
2979 | 1451 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15; |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1452 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1453 src += stride; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1454 dst += stride; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1455 } |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1456 } |
1319 | 1457 |
1458 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ | |
1459 switch(width){ | |
1460 case 2: avg_pixels2_c (dst, src, stride, height); break; | |
1461 case 4: avg_pixels4_c (dst, src, stride, height); break; | |
1462 case 8: avg_pixels8_c (dst, src, stride, height); break; | |
1463 case 16:avg_pixels16_c(dst, src, stride, height); break; | |
1464 } | |
1465 } | |
1466 | |
1467 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ | |
1468 int i,j; | |
1469 for (i=0; i < height; i++) { | |
1470 for (j=0; j < width; j++) { | |
2979 | 1471 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1; |
1319 | 1472 } |
1473 src += stride; | |
1474 dst += stride; | |
1475 } | |
1476 } | |
1477 | |
1478 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ | |
1479 int i,j; | |
1480 for (i=0; i < height; i++) { | |
1481 for (j=0; j < width; j++) { | |
2979 | 1482 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1; |
1319 | 1483 } |
1484 src += stride; | |
1485 dst += stride; | |
1486 } | |
1487 } | |
2967 | 1488 |
1319 | 1489 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ |
1490 int i,j; | |
1491 for (i=0; i < height; i++) { | |
1492 for (j=0; j < width; j++) { | |
2979 | 1493 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1; |
1319 | 1494 } |
1495 src += stride; | |
1496 dst += stride; | |
1497 } | |
1498 } | |
2967 | 1499 |
1319 | 1500 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ |
1501 int i,j; | |
1502 for (i=0; i < height; i++) { | |
1503 for (j=0; j < width; j++) { | |
2979 | 1504 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1; |
1319 | 1505 } |
1506 src += stride; | |
1507 dst += stride; | |
1508 } | |
1509 } | |
1510 | |
1511 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ | |
1512 int i,j; | |
1513 for (i=0; i < height; i++) { | |
1514 for (j=0; j < width; j++) { | |
2979 | 1515 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1; |
1319 | 1516 } |
1517 src += stride; | |
1518 dst += stride; | |
1519 } | |
1520 } | |
1521 | |
1522 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ | |
1523 int i,j; | |
1524 for (i=0; i < height; i++) { | |
1525 for (j=0; j < width; j++) { | |
2979 | 1526 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1; |
1319 | 1527 } |
1528 src += stride; | |
1529 dst += stride; | |
1530 } | |
1531 } | |
1532 | |
1533 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ | |
1534 int i,j; | |
1535 for (i=0; i < height; i++) { | |
1536 for (j=0; j < width; j++) { | |
2979 | 1537 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1; |
1319 | 1538 } |
1539 src += stride; | |
1540 dst += stride; | |
1541 } | |
1542 } | |
1543 | |
1544 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){ | |
1545 int i,j; | |
1546 for (i=0; i < height; i++) { | |
1547 for (j=0; j < width; j++) { | |
2979 | 1548 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1; |
1319 | 1549 } |
1550 src += stride; | |
1551 dst += stride; | |
1552 } | |
1553 } | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1554 #if 0 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1555 #define TPEL_WIDTH(width)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1556 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1557 void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1558 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1559 void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1560 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1561 void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1562 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1563 void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1564 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1565 void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1566 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1567 void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1568 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1569 void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1570 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1571 void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1572 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1573 void put_tpel_pixels_mc22_c(dst, src, stride, width, height);} |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1574 #endif |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1575 |
1168 | 1576 #define H264_CHROMA_MC(OPNAME, OP)\ |
1577 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\ | |
1578 const int A=(8-x)*(8-y);\ | |
1579 const int B=( x)*(8-y);\ | |
1580 const int C=(8-x)*( y);\ | |
1581 const int D=( x)*( y);\ | |
1582 int i;\ | |
1583 \ | |
1584 assert(x<8 && y<8 && x>=0 && y>=0);\ | |
1585 \ | |
6052
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1586 if(D){\ |
6054 | 1587 for(i=0; i<h; i++){\ |
6053 | 1588 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\ |
1589 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\ | |
1590 dst+= stride;\ | |
1591 src+= stride;\ | |
1592 }\ | |
6052
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1593 }else{\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1594 const int E= B+C;\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1595 const int step= C ? stride : 1;\ |
6054 | 1596 for(i=0; i<h; i++){\ |
6052
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1597 OP(dst[0], (A*src[0] + E*src[step+0]));\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1598 OP(dst[1], (A*src[1] + E*src[step+1]));\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1599 dst+= stride;\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1600 src+= stride;\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1601 }\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1602 }\ |
1168 | 1603 }\ |
1604 \ | |
1605 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\ | |
1606 const int A=(8-x)*(8-y);\ | |
1607 const int B=( x)*(8-y);\ | |
1608 const int C=(8-x)*( y);\ | |
1609 const int D=( x)*( y);\ | |
1610 int i;\ | |
1611 \ | |
1612 assert(x<8 && y<8 && x>=0 && y>=0);\ | |
1613 \ | |
6052
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1614 if(D){\ |
6054 | 1615 for(i=0; i<h; i++){\ |
6053 | 1616 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\ |
1617 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\ | |
1618 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\ | |
1619 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\ | |
1620 dst+= stride;\ | |
1621 src+= stride;\ | |
1622 }\ | |
6052
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1623 }else{\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1624 const int E= B+C;\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1625 const int step= C ? stride : 1;\ |
6054 | 1626 for(i=0; i<h; i++){\ |
6052
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1627 OP(dst[0], (A*src[0] + E*src[step+0]));\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1628 OP(dst[1], (A*src[1] + E*src[step+1]));\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1629 OP(dst[2], (A*src[2] + E*src[step+2]));\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1630 OP(dst[3], (A*src[3] + E*src[step+3]));\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1631 dst+= stride;\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1632 src+= stride;\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1633 }\ |
c90798ac28ee
~15% faster h264_chroma_mc2/4_c() these also prevent some possible out
michael
parents:
6051
diff
changeset
|
1634 }\ |
1168 | 1635 }\ |
1636 \ | |
1637 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\ | |
1638 const int A=(8-x)*(8-y);\ | |
1639 const int B=( x)*(8-y);\ | |
1640 const int C=(8-x)*( y);\ | |
1641 const int D=( x)*( y);\ | |
1642 int i;\ | |
1643 \ | |
1644 assert(x<8 && y<8 && x>=0 && y>=0);\ | |
1645 \ | |
6051
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1646 if(D){\ |
6054 | 1647 for(i=0; i<h; i++){\ |
6053 | 1648 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\ |
1649 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\ | |
1650 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\ | |
1651 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\ | |
1652 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\ | |
1653 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\ | |
1654 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\ | |
1655 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\ | |
1656 dst+= stride;\ | |
1657 src+= stride;\ | |
1658 }\ | |
6051
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1659 }else{\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1660 const int E= B+C;\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1661 const int step= C ? stride : 1;\ |
6054 | 1662 for(i=0; i<h; i++){\ |
6051
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1663 OP(dst[0], (A*src[0] + E*src[step+0]));\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1664 OP(dst[1], (A*src[1] + E*src[step+1]));\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1665 OP(dst[2], (A*src[2] + E*src[step+2]));\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1666 OP(dst[3], (A*src[3] + E*src[step+3]));\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1667 OP(dst[4], (A*src[4] + E*src[step+4]));\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1668 OP(dst[5], (A*src[5] + E*src[step+5]));\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1669 OP(dst[6], (A*src[6] + E*src[step+6]));\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1670 OP(dst[7], (A*src[7] + E*src[step+7]));\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1671 dst+= stride;\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1672 src+= stride;\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1673 }\ |
1e3b5597505a
30% faster h264_chroma_mc8_c(), this also prevents a possible out of
michael
parents:
6001
diff
changeset
|
1674 }\ |
1168 | 1675 } |
1676 | |
1677 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1) | |
1678 #define op_put(a, b) a = (((b) + 32)>>6) | |
1679 | |
1680 H264_CHROMA_MC(put_ , op_put) | |
1681 H264_CHROMA_MC(avg_ , op_avg) | |
1682 #undef op_avg | |
1683 #undef op_put | |
1684 | |
/* 8-wide no-rounding chroma motion compensation, bilinear interpolation.
 * Renamed from put_no_rnd_h264_chroma* to reflect its usage in VC-1 only.
 *
 * dst    destination block (align 8), written h rows of 8 pixels
 * src    source block (align 1); reads h+1 rows of 9 pixels
 * stride line size of both dst and src in bytes
 * h      number of rows to process
 * x, y   fractional (1/8-pel) motion vector components, 0..7
 *
 * Weights A..D are the bilinear taps (they sum to 64); "+ 32 - 4" is the
 * VC-1 no-rounding bias (standard rounding would be +32). */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}
1708 | |
/* Averaging variant of put_no_rnd_vc1_chroma_mc8_c(): computes the same
 * no-rounding bilinear interpolation and averages it into the existing
 * destination pixels via avg2() (defined elsewhere in this file).
 * Parameters are identical to the put_ variant. */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst+= stride;
        src+= stride;
    }
}
1732 | |
/* QPEL_MC(r, OPNAME, RND, OP) instantiates a full family of MPEG-4
 * quarter-pel motion compensation functions:
 *   - 8/16-wide horizontal and vertical lowpass filters using the
 *     MPEG-4 (20, -6, 3, -1)/16 half-pel filter; source samples past
 *     the block edge are mirrored (note the repeated src[8]/src[16]
 *     taps in the last rows/columns).
 *   - the 16 quarter-pel positions (mcXY, X/Y in 0..3) for both 8x8
 *     and 16x16 blocks, built by combining the lowpass filters with
 *     pixel averaging helpers (pixels*_l2/_l4, copy_block9/17 --
 *     defined elsewhere in this file).
 * OPNAME selects put/avg, RND selects the rounding variant used for
 * intermediates, OP is the store macro (uses the cm crop table).
 * The *_old_c variants keep the previous (4-way average) behaviour and
 * are exported with an ff_ prefix. */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
255 | 2215 |
651 | 2216 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1) |
2217 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1) | |
2218 #define op_put(a, b) a = cm[((b) + 16)>>5] | |
2219 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5] | |
2220 | |
2221 QPEL_MC(0, put_ , _ , op_put) | |
2222 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd) | |
2223 QPEL_MC(0, avg_ , _ , op_avg) | |
2224 //QPEL_MC(1, avg_no_rnd , _ , op_avg) | |
2225 #undef op_avg | |
2226 #undef op_avg_no_rnd | |
2227 #undef op_put | |
2228 #undef op_put_no_rnd | |
255 | 2229 |
1168 | 2230 #if 1 |
2231 #define H264_LOWPASS(OPNAME, OP, OP2) \ | |
5151 | 2232 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2233 const int h=2;\ |
4176 | 2234 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2235 int i;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2236 for(i=0; i<h; i++)\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2237 {\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2238 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2239 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2240 dst+=dstStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2241 src+=srcStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2242 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2243 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2244 \ |
5151 | 2245 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2246 const int w=2;\ |
4176 | 2247 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2248 int i;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2249 for(i=0; i<w; i++)\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2250 {\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2251 const int srcB= src[-2*srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2252 const int srcA= src[-1*srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2253 const int src0= src[0 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2254 const int src1= src[1 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2255 const int src2= src[2 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2256 const int src3= src[3 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2257 const int src4= src[4 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2258 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2259 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2260 dst++;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2261 src++;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2262 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2263 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2264 \ |
5151 | 2265 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2266 const int h=2;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2267 const int w=2;\ |
4176 | 2268 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2269 int i;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2270 src -= 2*srcStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2271 for(i=0; i<h+5; i++)\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2272 {\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2273 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2274 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2275 tmp+=tmpStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2276 src+=srcStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2277 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2278 tmp -= tmpStride*(h+5-2);\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2279 for(i=0; i<w; i++)\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2280 {\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2281 const int tmpB= tmp[-2*tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2282 const int tmpA= tmp[-1*tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2283 const int tmp0= tmp[0 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2284 const int tmp1= tmp[1 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2285 const int tmp2= tmp[2 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2286 const int tmp3= tmp[3 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2287 const int tmp4= tmp[4 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2288 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2289 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2290 dst++;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2291 tmp++;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2292 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2293 }\ |
1168 | 2294 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
2295 const int h=4;\ | |
4176 | 2296 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
1168 | 2297 int i;\ |
2298 for(i=0; i<h; i++)\ | |
2299 {\ | |
2300 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\ | |
2301 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\ | |
2302 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\ | |
2303 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\ | |
2304 dst+=dstStride;\ | |
2305 src+=srcStride;\ | |
2306 }\ | |
2307 }\ | |
2308 \ | |
2309 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
2310 const int w=4;\ | |
4176 | 2311 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
1168 | 2312 int i;\ |
2313 for(i=0; i<w; i++)\ | |
2314 {\ | |
2315 const int srcB= src[-2*srcStride];\ | |
2316 const int srcA= src[-1*srcStride];\ | |
2317 const int src0= src[0 *srcStride];\ | |
2318 const int src1= src[1 *srcStride];\ | |
2319 const int src2= src[2 *srcStride];\ | |
2320 const int src3= src[3 *srcStride];\ | |
2321 const int src4= src[4 *srcStride];\ | |
2322 const int src5= src[5 *srcStride];\ | |
2323 const int src6= src[6 *srcStride];\ | |
2324 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\ | |
2325 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\ | |
2326 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\ | |
2327 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\ | |
2328 dst++;\ | |
2329 src++;\ | |
2330 }\ | |
2331 }\ | |
2332 \ | |
2333 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | |
2334 const int h=4;\ | |
2335 const int w=4;\ | |
4176 | 2336 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
1168 | 2337 int i;\ |
2338 src -= 2*srcStride;\ | |
2339 for(i=0; i<h+5; i++)\ | |
2340 {\ | |
2341 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\ | |
2342 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\ | |
2343 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\ | |
2344 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\ | |
2345 tmp+=tmpStride;\ | |
2346 src+=srcStride;\ | |
2347 }\ | |
2348 tmp -= tmpStride*(h+5-2);\ | |
2349 for(i=0; i<w; i++)\ | |
2350 {\ | |
2351 const int tmpB= tmp[-2*tmpStride];\ | |
2352 const int tmpA= tmp[-1*tmpStride];\ | |
2353 const int tmp0= tmp[0 *tmpStride];\ | |
2354 const int tmp1= tmp[1 *tmpStride];\ | |
2355 const int tmp2= tmp[2 *tmpStride];\ | |
2356 const int tmp3= tmp[3 *tmpStride];\ | |
2357 const int tmp4= tmp[4 *tmpStride];\ | |
2358 const int tmp5= tmp[5 *tmpStride];\ | |
2359 const int tmp6= tmp[6 *tmpStride];\ | |
2360 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\ | |
2361 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\ | |
2362 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\ | |
2363 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\ | |
2364 dst++;\ | |
2365 tmp++;\ | |
2366 }\ | |
2367 }\ | |
2368 \ | |
2369 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
2370 const int h=8;\ | |
4176 | 2371 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
1168 | 2372 int i;\ |
2373 for(i=0; i<h; i++)\ | |
2374 {\ | |
2375 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\ | |
2376 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\ | |
2377 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\ | |
2378 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\ | |
2379 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\ | |
2380 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\ | |
2381 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\ | |
2382 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\ | |
2383 dst+=dstStride;\ | |
2384 src+=srcStride;\ | |
2385 }\ | |
2386 }\ | |
2387 \ | |
2388 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
2389 const int w=8;\ | |
4176 | 2390 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
1168 | 2391 int i;\ |
2392 for(i=0; i<w; i++)\ | |
2393 {\ | |
2394 const int srcB= src[-2*srcStride];\ | |
2395 const int srcA= src[-1*srcStride];\ | |
2396 const int src0= src[0 *srcStride];\ | |
2397 const int src1= src[1 *srcStride];\ | |
2398 const int src2= src[2 *srcStride];\ | |
2399 const int src3= src[3 *srcStride];\ | |
2400 const int src4= src[4 *srcStride];\ | |
2401 const int src5= src[5 *srcStride];\ | |
2402 const int src6= src[6 *srcStride];\ | |
2403 const int src7= src[7 *srcStride];\ | |
2404 const int src8= src[8 *srcStride];\ | |
2405 const int src9= src[9 *srcStride];\ | |
2406 const int src10=src[10*srcStride];\ | |
2407 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\ | |
2408 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\ | |
2409 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\ | |
2410 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\ | |
2411 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\ | |
2412 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\ | |
2413 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\ | |
2414 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\ | |
2415 dst++;\ | |
2416 src++;\ | |
2417 }\ | |
2418 }\ | |
2419 \ | |
2420 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | |
2421 const int h=8;\ | |
2422 const int w=8;\ | |
4176 | 2423 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
1168 | 2424 int i;\ |
2425 src -= 2*srcStride;\ | |
2426 for(i=0; i<h+5; i++)\ | |
2427 {\ | |
2428 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\ | |
2429 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\ | |
2430 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\ | |
2431 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\ | |
2432 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\ | |
2433 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\ | |
2434 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\ | |
2435 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\ | |
2436 tmp+=tmpStride;\ | |
2437 src+=srcStride;\ | |
2438 }\ | |
2439 tmp -= tmpStride*(h+5-2);\ | |
2440 for(i=0; i<w; i++)\ | |
2441 {\ | |
2442 const int tmpB= tmp[-2*tmpStride];\ | |
2443 const int tmpA= tmp[-1*tmpStride];\ | |
2444 const int tmp0= tmp[0 *tmpStride];\ | |
2445 const int tmp1= tmp[1 *tmpStride];\ | |
2446 const int tmp2= tmp[2 *tmpStride];\ | |
2447 const int tmp3= tmp[3 *tmpStride];\ | |
2448 const int tmp4= tmp[4 *tmpStride];\ | |
2449 const int tmp5= tmp[5 *tmpStride];\ | |
2450 const int tmp6= tmp[6 *tmpStride];\ | |
2451 const int tmp7= tmp[7 *tmpStride];\ | |
2452 const int tmp8= tmp[8 *tmpStride];\ | |
2453 const int tmp9= tmp[9 *tmpStride];\ | |
2454 const int tmp10=tmp[10*tmpStride];\ | |
2455 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\ | |
2456 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\ | |
2457 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\ | |
2458 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\ | |
2459 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\ | |
2460 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\ | |
2461 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\ | |
2462 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\ | |
2463 dst++;\ | |
2464 tmp++;\ | |
2465 }\ | |
2466 }\ | |
2467 \ | |
2468 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
2469 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\ | |
2470 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\ | |
2471 src += 8*srcStride;\ | |
2472 dst += 8*dstStride;\ | |
2473 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\ | |
2474 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\ | |
2475 }\ | |
2476 \ | |
2477 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
2478 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\ | |
2479 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\ | |
2480 src += 8*srcStride;\ | |
2481 dst += 8*dstStride;\ | |
2482 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\ | |
2483 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\ | |
2484 }\ | |
2485 \ | |
2486 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | |
2487 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\ | |
2488 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\ | |
2489 src += 8*srcStride;\ | |
2490 dst += 8*dstStride;\ | |
2491 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\ | |
2492 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\ | |
2493 }\ | |
2494 | |
/**
 * H264_MC(OPNAME, SIZE) instantiates the 16 quarter-pel motion compensation
 * functions OPNAME##h264_qpel##SIZE##_mcXY_c for one block size, where X/Y
 * is the quarter-pel offset (0..3) in the horizontal/vertical direction.
 * mc00 is the full-pel copy, mc20/mc02 the pure horizontal/vertical
 * half-pel filters, mc22 the centre (H then V) half-pel position; all other
 * positions are formed by averaging two intermediates with the
 * OPNAME##pixels##SIZE##_l2 helpers, mirroring the H.264 interpolation
 * scheme. The half-pel intermediates come from the *_lowpass functions
 * generated by H264_LOWPASS above; copy_block##SIZE pads SIZE+5 source rows
 * into `full` so the vertical filter can read 2 rows above / 3 below.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
/* Pixel store/average operators plugged into H264_LOWPASS:
 * op_put/op_avg round and clip after ONE 6-tap pass (result scaled by 32,
 * hence +16 >> 5); op2_put/op2_avg do the same after TWO passes in the hv
 * functions (scaled by 32*32 = 1024, hence +512 >> 10). `cm` is the
 * clipping table in scope inside the generated functions. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]

H264_LOWPASS(put_ , op_put, op2_put)
H264_LOWPASS(avg_ , op_avg, op2_avg)
/* 2xX/Xx2 variants exist only for put_ (used by the snow codec). */
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2653 | |
/* Weighted-prediction operators: op_scale1 scales one pixel in place
 * (unidirectional), op_scale2 blends src into dst (bidirectional). Both
 * clip the result to 0..255 with av_clip_uint8. */
#define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/**
 * H264_WEIGHT(W,H) instantiates the explicit weighted-prediction functions
 * for a WxH block: weight_h264_pixelsWxH_c (one reference, in place) and
 * biweight_h264_pixelsWxH_c (two references blended into dst). The
 * `if(W==n) continue;` lines rely on W being a compile-time constant so
 * the unused tail of the unrolled row is removed by the compiler.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    /* add rounding term for the >> log2_denom below */ \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    /* combined offset + rounding term, forced odd before the final
     * >> (log2_denom+1) in op_scale2 */ \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2723 | |
936 | 2724 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){ |
4176 | 2725 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
936 | 2726 int i; |
2727 | |
2728 for(i=0; i<h; i++){ | |
2729 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4]; | |
2730 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4]; | |
2731 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4]; | |
2732 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4]; | |
2733 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4]; | |
2734 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4]; | |
2735 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4]; | |
2736 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4]; | |
2737 dst+=dstStride; | |
2967 | 2738 src+=srcStride; |
936 | 2739 } |
2740 } | |
2741 | |
#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* Full-pel (mc00) motion compensation for the CAVS decoder: plain copy /
 * average wrappers around the generic pixel functions, exported with
 * external linkage so cavsdsp can reference them. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
3395
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2759 |
void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_VC1_DECODER
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/* Full-pel (mc00) motion compensation for VC-1: copy/average wrappers.
 * The rnd parameter is unused here since no filtering (and thus no
 * rounding) happens at the integer-pel position. */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    avg_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER */
3526 | 2773 |
void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);

/* H264 specific */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_RV30_DECODER
void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV30_DECODER */

#if CONFIG_RV40_DECODER
/* RV40 mc33 (3/4,3/4 position) is implemented as the (1/2,1/2) half-pel
 * average (the *_xy2 pixel functions) rather than a true quarter-pel
 * filter. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}

void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV40_DECODER */
2799 | |
936 | 2800 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){ |
4176 | 2801 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
936 | 2802 int i; |
2803 | |
2804 for(i=0; i<w; i++){ | |
2805 const int src_1= src[ -srcStride]; | |
2806 const int src0 = src[0 ]; | |
2807 const int src1 = src[ srcStride]; | |
2808 const int src2 = src[2*srcStride]; | |
2809 const int src3 = src[3*srcStride]; | |
2810 const int src4 = src[4*srcStride]; | |
2811 const int src5 = src[5*srcStride]; | |
2812 const int src6 = src[6*srcStride]; | |
2813 const int src7 = src[7*srcStride]; | |
2814 const int src8 = src[8*srcStride]; | |
2815 const int src9 = src[9*srcStride]; | |
2816 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4]; | |
2817 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4]; | |
2818 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4]; | |
2819 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4]; | |
2820 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4]; | |
2821 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4]; | |
2822 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4]; | |
2823 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4]; | |
2824 src++; | |
2825 dst++; | |
2826 } | |
2827 } | |
2828 | |
/* mspel (0,0): full-pel position, straight 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2832 | |
/* mspel (1,0): quarter-pel x — average of the source and its
 * horizontally lowpass-filtered version. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2838 | |
/* mspel (2,0): half-pel x — horizontal lowpass only, written directly. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2842 | |
/* mspel (3,0): three-quarter-pel x — average of src shifted one pixel
 * right and the horizontally filtered block. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2848 | |
/* mspel (0,2): half-pel y — vertical lowpass only, written directly. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2852 | |
/* mspel (1,2): average of the vertically filtered source and the
 * 2-D (horizontal-then-vertical) filtered block. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];   /* 8x11: one row above and two below for the vertical pass */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);   /* halfH+8 skips the extra top row */
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel (3,2): like mc12 but the vertical-only pass starts one pixel
 * to the right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];   /* 8x11 horizontally filtered block */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel (2,2): half-pel in both axes — horizontal then vertical lowpass. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];   /* 8x11 intermediate for the vertical pass */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2876 | |
/**
 * H.263 deblocking across a horizontal block edge (filters vertically
 * adjacent samples p0 p1 | p2 p3 in each of 8 columns).
 * Filter strength is looked up from the quantizer; the body is wrapped in
 * a constant `if` so the compiler can drop it when no H.263 codec is built.
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;   /* edge gradient estimate */

        /* piecewise-linear ramp: full correction for small |d|, tapering
           to zero beyond 2*strength (preserves real edges) */
        if (d<-2*strength) d1= 0;
        else if(d<- strength) d1=-2*strength - d;
        else if(d< strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branch-light clamp to 0..255: if bit 8 is set the value is out of
           range; ~(x>>31) yields 255 for positive overflow, 0 for negative */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        /* secondary, weaker correction of the outer pair, bounded by |d1|/2 */
        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+ stride] = p3 + d2;
    }
    }
}
2913 | |
/**
 * H.263 deblocking across a vertical block edge (filters horizontally
 * adjacent samples p0 p1 | p2 p3 in each of 8 rows).
 * Mirror of h263_v_loop_filter_c with x/y roles swapped.
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;   /* edge gradient estimate */

        /* same tapering ramp as the vertical variant */
        if (d<-2*strength) d1= 0;
        else if(d<- strength) d1=-2*strength - d;
        else if(d< strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else d1= 0;

        p1 += d1;
        p2 -= d1;
        /* clamp to 0..255 via the bit-8/sign trick */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}
936 | 2950 |
/**
 * H.261 in-loop filter: separable 1-2-1 smoothing of one 8x8 block,
 * applied vertically into a temp buffer and then horizontally back into
 * src. Border rows/columns are passed through unfiltered (scaled so the
 * final shifts cancel).
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int temp[64];
    int row, col;

    /* vertical 1-2-1 pass; top and bottom rows are copied, pre-scaled by 4
       so the horizontal pass's normalization leaves them unchanged */
    for (col = 0; col < 8; col++) {
        temp[col]       = 4 * src[col];
        temp[col + 7*8] = 4 * src[col + 7*stride];
    }
    for (row = 1; row < 7; row++) {
        for (col = 0; col < 8; col++) {
            const int s = row * stride + col;
            temp[row * 8 + col] = src[s - stride] + 2 * src[s] + src[s + stride];
        }
    }

    /* horizontal 1-2-1 pass with rounding; edge columns only renormalized */
    for (row = 0; row < 8; row++) {
        src[row * stride]     = (temp[row * 8]     + 2) >> 2;
        src[7 + row * stride] = (temp[7 + row * 8] + 2) >> 2;
        for (col = 1; col < 7; col++) {
            const int t = row * 8 + col;
            src[row * stride + col] = (temp[t - 1] + 2 * temp[t] + temp[t + 1] + 8) >> 4;
        }
    }
}
b6f2add2511e
h261 decoder by (Maarten Daniels <maarten.daniels at student dot luc dot ac dot be>)
michael
parents:
1984
diff
changeset
|
2977 |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/**
 * H.264 "normal" (bS < 4) luma deblocking of one 16-sample edge.
 * The edge is processed as four 4-sample groups; tc0[i] < 0 marks a group
 * that must be skipped entirely. xstride steps across the edge, ystride
 * steps along it.
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;   /* group disabled: skip its 4 samples */
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            /* filter only where the edge looks like blocking, not a real edge */
            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* flat inner p-side: also adjust p1 and widen the clip range */
                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                /* flat inner q-side: also adjust q1 */
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/* v variant: step across the edge by rows (xstride=stride), along it by 1. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3022 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) |
2633 | 3023 { |
3024 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0); | |
3025 } | |
3026 | |
8395
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
/**
 * H.264 strong (intra, bS = 4) luma deblocking of one 16-sample edge.
 * Per sample it chooses between the strong 4/5-tap filters (when the edge
 * step is small relative to alpha) and the weak 3-tap fallback.
 */
static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 16; d++ ) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];

        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            /* small step across the edge: strong filtering allowed */
            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
                if( FFABS( p2 - p0 ) < beta)
                {
                    const int p3 = pix[-4*xstride];
                    /* p0', p1', p2' */
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                } else {
                    /* p0' */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                }
                if( FFABS( q2 - q0 ) < beta)
                {
                    const int q3 = pix[3*xstride];
                    /* q0', q1', q2' */
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                } else {
                    /* q0' */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                }
            }else{
                /* larger step: only the samples adjacent to the edge move */
                /* p0', q0' */
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
/* v variant of the strong luma filter: xstride=stride, ystride=1. */
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
}
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
/* h variant of the strong luma filter: xstride=1, ystride=stride. */
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
}
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3083 |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/**
 * H.264 normal (bS < 4) chroma deblocking of one 8-sample edge, processed
 * as four 2-sample groups. tc0[i] <= 0 disables a group. Only p0/q0 are
 * modified (chroma never touches the second row of samples).
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;   /* group disabled: skip its 2 samples */
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0] = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/* v variant of the chroma filter: xstride=stride, ystride=1. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/* h variant of the chroma filter: xstride=1, ystride=stride. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
3120 | |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/**
 * H.264 strong (intra, bS = 4) chroma deblocking of one 8-sample edge.
 * No tc clipping: where the thresholds pass, p0/q0 are replaced by fixed
 * 3-tap averages.
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/* v variant of the strong chroma filter: xstride=stride, ystride=1. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/* h variant of the strong chroma filter: xstride=1, ystride=stride. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3148 |
/**
 * Sum of absolute differences between two 16-wide pixel blocks.
 *
 * @param v         unused context pointer (kept for the me_cmp signature)
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size row stride for both blocks
 * @param h         number of rows to compare
 * @return SAD over the 16 x h region
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3176 | |
/**
 * SAD of pix1 against the horizontal half-pel interpolation of pix2
 * (each reference sample is avg2 of two horizontal neighbours, so
 * pix2[16] of each row is read).
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3204 | |
/**
 * SAD of pix1 against the vertical half-pel interpolation of pix2
 * (avg2 of each sample with the one directly below it).
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3234 | |
/**
 * SAD of pix1 against the 2-D half-pel interpolation of pix2
 * (avg4 over the 2x2 neighbourhood; reads one extra column and row).
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3264 | |
/**
 * Sum of absolute differences between two 8-wide pixel blocks.
 * 8-column counterpart of pix_abs16_c.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3284 | |
/**
 * SAD of pix1 against the horizontal half-pel interpolation of pix2,
 * 8-column variant (reads pix2[8] of each row).
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3304 | |
/**
 * SAD of pix1 against the vertical half-pel interpolation of pix2,
 * 8-column variant.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3326 | |
/**
 * SAD of pix1 against the 2-D half-pel interpolation of pix2,
 * 8-column variant (avg4 over each 2x2 neighbourhood).
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3348 | |
/**
 * Noise-preserving SSE for a 16-pixel-wide block.
 * score1 is the plain sum of squared errors; score2 accumulates how much
 * the local 2x2 second-order gradients of s1 and s2 differ, i.e. how much
 * "texture/noise" was lost or added. The caller-configurable nsse_weight
 * balances the two terms (8 when no context is available).
 */
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        /* plain SSE of this row */
        for(x=0; x<16; x++){
            score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
        }
        /* gradient difference needs the next row, so skip the last one */
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS( s1[x ] - s1[x +stride]
                              - s1[x+1] + s1[x+1+stride])
                        -FFABS( s2[x ] - s2[x +stride]
                              - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3374 |
/**
 * Noise-preserving SSE for an 8-pixel-wide block.
 * Identical to nsse16_c but operating on 8 columns (7 gradient pairs).
 */
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        /* plain SSE of this row */
        for(x=0; x<8; x++){
            score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
        }
        /* gradient difference needs the next row, so skip the last one */
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS( s1[x ] - s1[x +stride]
                              - s1[x+1] + s1[x+1+stride])
                        -FFABS( s2[x ] - s2[x +stride]
                              - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3400 |
1784 | 3401 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){ |
3402 int i; | |
3403 unsigned int sum=0; | |
3404 | |
3405 for(i=0; i<8*8; i++){ | |
3406 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT)); | |
3407 int w= weight[i]; | |
3408 b>>= RECON_SHIFT; | |
3409 assert(-512<b && b<512); | |
3410 | |
3411 sum += (w*b)*(w*b)>>4; | |
3412 } | |
3413 return sum>>2; | |
3414 } | |
3415 | |
3416 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){ | |
3417 int i; | |
3418 | |
3419 for(i=0; i<8*8; i++){ | |
3420 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT); | |
2967 | 3421 } |
1784 | 3422 } |
3423 | |
/**
 * permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 * (inverse) permutated to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    /* Copy the nonzero region out (scantable gives the positions of the
     * first 'last'+1 coefficients) and clear it in the source. */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    /* Scatter the saved coefficients back at their permuted positions. */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}
34 | 3452 |
/** Comparison function that always reports a score of 0 (FF_CMP_ZERO). */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s; (void)a; (void)b; (void)stride; (void)h;
    return 0;
}
3456 | |
/**
 * Fill cmp[0..5] with the comparison functions selected by 'type'.
 * Only the low byte of 'type' chooses the FF_CMP_* metric; the six slots
 * mirror the size/variant arrays in DSPContext (e.g. c->sad[i]).
 * NOTE(review): upper bits of 'type' are presumably flags handled by
 * callers — confirm against the FF_CMP_* definitions.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_SNOW_ENCODER
        /* wavelet metrics exist only when the snow encoder is built */
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
3516 | |
8288 | 3517 static void clear_block_c(DCTELEM *block) |
3518 { | |
3519 memset(block, 0, sizeof(DCTELEM)*64); | |
3520 } | |
3521 | |
1101 | 3522 /** |
3523 * memset(blocks, 0, sizeof(DCTELEM)*6*64) | |
3524 */ | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
3525 static void clear_blocks_c(DCTELEM *blocks) |
296 | 3526 { |
3527 memset(blocks, 0, sizeof(DCTELEM)*6*64); | |
3528 } | |
3529 | |
866 | 3530 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){ |
6385 | 3531 long i; |
3532 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){ | |
3533 long a = *(long*)(src+i); | |
3534 long b = *(long*)(dst+i); | |
3535 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80); | |
866 | 3536 } |
3537 for(; i<w; i++) | |
3538 dst[i+0] += src[i+0]; | |
3539 } | |
3540 | |
6384 | 3541 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ |
6385 | 3542 long i; |
6384 | 3543 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){ |
3544 long a = *(long*)(src1+i); | |
3545 long b = *(long*)(src2+i); | |
6385 | 3546 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80); |
6384 | 3547 } |
3548 for(; i<w; i++) | |
3549 dst[i] = src1[i]+src2[i]; | |
3550 } | |
3551 | |
866 | 3552 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ |
6385 | 3553 long i; |
8590 | 3554 #if !HAVE_FAST_UNALIGNED |
6385 | 3555 if((long)src2 & (sizeof(long)-1)){ |
6386 | 3556 for(i=0; i+7<w; i+=8){ |
3557 dst[i+0] = src1[i+0]-src2[i+0]; | |
3558 dst[i+1] = src1[i+1]-src2[i+1]; | |
3559 dst[i+2] = src1[i+2]-src2[i+2]; | |
3560 dst[i+3] = src1[i+3]-src2[i+3]; | |
3561 dst[i+4] = src1[i+4]-src2[i+4]; | |
3562 dst[i+5] = src1[i+5]-src2[i+5]; | |
3563 dst[i+6] = src1[i+6]-src2[i+6]; | |
3564 dst[i+7] = src1[i+7]-src2[i+7]; | |
3565 } | |
6385 | 3566 }else |
3567 #endif | |
3568 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){ | |
3569 long a = *(long*)(src1+i); | |
3570 long b = *(long*)(src2+i); | |
3571 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80); | |
3572 } | |
866 | 3573 for(; i<w; i++) |
3574 dst[i+0] = src1[i+0]-src2[i+0]; | |
3575 } | |
3576 | |
/**
 * HuffYUV median-predictor decode: dst[i] = median(left, top, left+top-topleft)
 * + diff[i], where src1 is the previous (top) row. Updates *left and
 * *left_top so the caller can continue across slices.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t pred_left = *left;
    uint8_t pred_tl   = *left_top;

    for (i = 0; i < w; i++) {
        pred_left = mid_pred(pred_left, src1[i], (pred_left + src1[i] - pred_tl) & 0xFF) + diff[i];
        pred_tl   = src1[i];
        dst[i]    = pred_left;
    }

    *left     = pred_left;
    *left_top = pred_tl;
}
3593 | |
/**
 * HuffYUV median-predictor encode: dst[i] = src2[i] - median(left, top,
 * left+top-topleft), where src1 is the previous (top) row and src2 the
 * current one. Updates *left and *left_top for the caller.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t cur_left = *left;
    uint8_t cur_tl   = *left_top;

    for (i = 0; i < w; i++) {
        const int prediction = mid_pred(cur_left, src1[i], (cur_left + src1[i] - cur_tl) & 0xFF);
        cur_tl   = src1[i];
        cur_left = src2[i];
        dst[i]   = cur_left - prediction;
    }

    *left     = cur_left;
    *left_top = cur_tl;
}
3611 | |
/**
 * HuffYUV left-predictor decode: running sum of src into dst, starting
 * from 'acc'. Each dst byte gets the low 8 bits of the accumulator.
 * @return the final accumulator value
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for (i = 0; i < w; i++) {
        acc   += src[i];
        dst[i] = acc;
    }

    return acc;
}
3630 | |
/* Byte offsets of the colour channels inside a packed 32-bit BGRA pixel,
 * chosen by host endianness. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * HuffYUV left-predictor decode for packed BGR32 pixels: each channel is a
 * running sum of its own residuals, seeded from and written back to
 * *red/*green/*blue/*alpha.
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r = *red, g = *green, b = *blue, a = *alpha;

    for (i = 0; i < w; i++) {
        const uint8_t *sp = src + 4*i;
        uint8_t       *dp = dst + 4*i;

        b += sp[B];
        g += sp[G];
        r += sp[R];
        a += sp[A];

        dp[B] = b;
        dp[G] = g;
        dp[R] = r;
        dp[A] = a;
    }

    *red   = r;
    *green = g;
    *blue  = b;
    *alpha = a;
}
#undef B
#undef G
#undef R
#undef A
10370 | 3671 |
/* Butterfly writing sum/difference of i1,i2 into two separate outputs. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* In-place butterfly: x,y become x+y, x-y. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x+y| + |x-y| — final butterfly stage folded into the absolute sum. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
936 | 3686 |
/**
 * SATD: 8x8 Hadamard transform of the difference src-dst, returning the
 * sum of absolute transform coefficients. Rows are transformed first,
 * then columns; the last column butterfly is merged into the absolute
 * sum via BUTTERFLYA.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        /* stage 1: butterflies on adjacent difference pairs of row i */
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        /* stage 2: distance-2 butterflies */
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        /* stage 3: distance-4 butterflies complete the row transform */
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        /* column transform, same three stages; the last stage is folded
         * into BUTTERFLYA which also takes absolute values */
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3738 | |
/**
 * Intra SATD: 8x8 Hadamard transform of the source block itself (no
 * reference), returning the sum of absolute coefficients minus the DC
 * term, so a flat block scores 0.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        /* row transform: three butterfly stages on the raw pixels */
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        /* column transform; final stage folded into the absolute sum */
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3786 | |
/**
 * DCT-based SAD: forward-DCT the 8x8 difference block and return the sum
 * of absolute transform coefficients (via the dsp sum_abs_dctelem hook).
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
3798 | |
#if CONFIG_GPL
/**
 * One 1-D pass of the 8x8 integer DCT used by x264 (H.264 high profile).
 * Element access goes through SRC(x)/DST(x,v), so the same macro body
 * serves both the row pass and the column pass of dct264_sad8x8_c below.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0, a0 + a1 ) ;\
    DST(1, a4 + (a7>>2)) ;\
    DST(2, a2 + (a3>>1)) ;\
    DST(3, a5 + (a6>>2)) ;\
    DST(4, a0 - a1 ) ;\
    DST(5, a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * SAD in the x264 8x8 DCT domain: transform the difference block with a
 * row pass then a column pass of DCT8_1D; the column pass accumulates
 * the absolute coefficients directly through its DST() macro.
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

/* row pass: transform each row of dct in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

/* column pass: transform each column, summing |coefficient| instead of
 * storing the results */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
533c6386eca9
8x8 integer dct from x264 as cmp function (under CONFIG_GPL)
michael
parents:
2979
diff
changeset
|
3851 |
/**
 * DCT-domain maximum: forward-DCT the 8x8 difference block and return the
 * largest absolute transform coefficient.
 */
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}
3868 | |
/**
 * Quantization-noise metric: DCT the difference block, quantize,
 * dequantize and inverse-transform it, then return the SSE between the
 * round-tripped coefficients and the originals — i.e. the distortion the
 * quantizer alone would introduce at the current qscale.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* one buffer, split in two halves: temp = working copy, bak = original */
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;  /* metric is computed as an inter block */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
3892 | |
/**
 * Rate-distortion metric for an 8x8 block: quantizes the DCT of the
 * difference, counts the VLC bits needed to code the coefficients (rate),
 * reconstructs the block and measures its SSE against the source
 * (distortion), and returns distortion + lambda-scaled rate.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DECLARE_ALIGNED_16(uint64_t, aligned_src1[8]);
    DECLARE_ALIGNED_16(uint64_t, aligned_src2[8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const lsrc1 = (uint8_t*)aligned_src1;
    uint8_t * const lsrc2 = (uint8_t*)aligned_src2;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* work on aligned local copies so idct_add/sse can use stride 8 */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC tables matching the current (intra/inter) coding mode */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* count run/level VLC bits for every coefficient before the last */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;  /* bias into the table's index range */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        /* the last coefficient uses the "last" VLC table */
        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* reconstruct: dequantize and add the IDCT back onto the local copy */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    /* lambda ~ qscale^2 * 109/128, rounded */
    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
3971 | |
1708 | 3972 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){ |
1007 | 3973 MpegEncContext * const s= (MpegEncContext *)c; |
1064 | 3974 const uint8_t *scantable= s->intra_scantable.permutated; |
10094 | 3975 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]); |
1016 | 3976 DCTELEM * const temp= (DCTELEM*)aligned_temp; |
1007 | 3977 int i, last, run, bits, level, start_i; |
3978 const int esc_length= s->ac_esc_length; | |
3979 uint8_t * length; | |
3980 uint8_t * last_length; | |
1708 | 3981 |
3982 assert(h==8); | |
2967 | 3983 |
1013 | 3984 s->dsp.diff_pixels(temp, src1, src2, stride); |
1007 | 3985 |
1013 | 3986 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i); |
3987 | |
3988 bits=0; | |
2967 | 3989 |
1007 | 3990 if (s->mb_intra) { |
2967 | 3991 start_i = 1; |
1007 | 3992 length = s->intra_ac_vlc_length; |
3993 last_length= s->intra_ac_vlc_last_length; | |
1013 | 3994 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma |
1007 | 3995 } else { |
3996 start_i = 0; | |
3997 length = s->inter_ac_vlc_length; | |
3998 last_length= s->inter_ac_vlc_last_length; | |
3999 } | |
2967 | 4000 |
1013 | 4001 if(last>=start_i){ |
1007 | 4002 run=0; |
4003 for(i=start_i; i<last; i++){ | |
4004 int j= scantable[i]; | |
4005 level= temp[j]; | |
2967 | 4006 |
1007 | 4007 if(level){ |
4008 level+=64; | |
4009 if((level&(~127)) == 0){ | |
4010 bits+= length[UNI_AC_ENC_INDEX(run, level)]; | |
4011 }else | |
4012 bits+= esc_length; | |
4013 run=0; | |
4014 }else | |
4015 run++; | |
4016 } | |
4017 i= scantable[last]; | |
2967 | 4018 |
1013 | 4019 level= temp[i] + 64; |
2967 | 4020 |
1013 | 4021 assert(level - 64); |
2967 | 4022 |
1007 | 4023 if((level&(~127)) == 0){ |
4024 bits+= last_length[UNI_AC_ENC_INDEX(run, level)]; | |
4025 }else | |
4026 bits+= esc_length; | |
4027 } | |
4028 | |
4029 return bits; | |
4030 } | |
4031 | |
/**
 * Sum of absolute differences between each line and the line above it,
 * within one block: a cheap measure of vertical high-frequency content.
 * Instantiated below for 8- and 16-pixel-wide blocks.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score = 0; \
    int x, y; \
\
    for (y = 1; y < h; y++) { \
        for (x = 0; x < size; x += 4) { \
            score += FFABS(s[x  ] - s[x  +stride]) \
                   + FFABS(s[x+1] - s[x+1+stride]) \
                   + FFABS(s[x+2] - s[x+2+stride]) \
                   + FFABS(s[x+3] - s[x+3+stride]); \
        } \
        s += stride; \
    } \
\
    return score; \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
1729 | 4049 |
/**
 * Vertical SAD of the difference signal s1-s2 over a 16-wide block:
 * sums |(s1-s2)[x,y] - (s1-s2)[x,y-1]| for every pixel below the first line.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            score += FFABS(s1[x] - s2[x] - s1[x+stride] + s2[x+stride]);
        s1 += stride;
        s2 += stride;
    }

    return score;
}
4064 | |
#define SQ(a) ((a)*(a))

/**
 * Sum of squared differences between each line and the line above it,
 * within one block (squared-error variant of VSAD_INTRA).
 * Instantiated below for 8- and 16-pixel-wide blocks.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score = 0; \
    int x, y; \
\
    for (y = 1; y < h; y++) { \
        for (x = 0; x < size; x += 4) { \
            score += SQ(s[x  ] - s[x  +stride]) \
                   + SQ(s[x+1] - s[x+1+stride]) \
                   + SQ(s[x+2] - s[x+2+stride]) \
                   + SQ(s[x+3] - s[x+3+stride]); \
        } \
        s += stride; \
    } \
\
    return score; \
}
VSSE_INTRA(8)
VSSE_INTRA(16)

/**
 * Vertical sum of squared errors of the difference signal s1-s2 over a
 * 16-wide block (squared-error variant of vsad16_c).
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            score += SQ(s1[x] - s2[x] - s1[x+stride] + s2[x+stride]);
        s1 += stride;
        s2 += stride;
    }

    return score;
}
4098 | |
/** Sum of squared differences between an int8 and an int16 vector. */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score = 0;
    int i;
    for (i = 0; i < size; i++) {
        const int d = pix1[i] - pix2[i];
        score += d * d;
    }
    return score;
}
4107 | |
6056
558c1fd0ee72
Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
diego
parents:
6054
diff
changeset
|
4108 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c) |
558c1fd0ee72
Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
diego
parents:
6054
diff
changeset
|
4109 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c) |
558c1fd0ee72
Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
diego
parents:
6054
diff
changeset
|
4110 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c) |
8590 | 4111 #if CONFIG_GPL |
6056
558c1fd0ee72
Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
diego
parents:
6054
diff
changeset
|
4112 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c) |
3013 | 4113 #endif |
6056
558c1fd0ee72
Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
diego
parents:
6054
diff
changeset
|
4114 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c) |
558c1fd0ee72
Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
diego
parents:
6054
diff
changeset
|
4115 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c) |
558c1fd0ee72
Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
diego
parents:
6054
diff
changeset
|
4116 WRAPPER8_16_SQ(rd8x8_c, rd16_c) |
558c1fd0ee72
Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
diego
parents:
6054
diff
changeset
|
4117 WRAPPER8_16_SQ(bit8x8_c, bit16_c) |
936 | 4118 |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
/** Element-wise in-place multiply: dst[i] *= src[i]. */
static void vector_fmul_c(float *dst, const float *src, int len){
    const float *end = src + len;
    while (src < end)
        *dst++ *= *src++;
}
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4124 |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
/** dst[i] = src0[i] * src1[len-1-i]: multiply src0 by src1 read backwards. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src0[i] * src1[len-1-i];
}
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4131 |
10300
4d1b9ca628fc
Drop unused args from vector_fmul_add_add, simpify code, and rename
mru
parents:
10219
diff
changeset
|
/** Fused multiply-add over vectors: dst[i] = src0[i]*src1[i] + src2[i]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int n;
    for (n = 0; n < len; n++)
        dst[n] = src0[n] * src1[n] + src2[n];
}
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4137 |
/**
 * Overlap-window two half-length signals into a 2*len output (MDCT overlap-add):
 * writes dst[0..2*len-1] from src0[0..len-1] and src1[0..len-1] using the
 * symmetric window win[0..2*len-1]; add_bias is added to every sample.
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int i, j;
    /* index from the midpoint so i runs over the first half, j over the second */
    dst  += len;
    win  += len;
    src0 += len;
    for (i = -len, j = len-1; i < 0; i++, j--) {
        const float s0 = src0[i];
        const float s1 = src1[j];
        const float wi = win[i];
        const float wj = win[j];
        dst[i] = s0*wj - s1*wi + add_bias;
        dst[j] = s0*wi + s1*wj + add_bias;
    }
}
4152 | |
/** Scale a vector by a scalar: dst[i] = src[i] * mul. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int idx;
    for (idx = 0; idx < len; idx++)
        dst[idx] = src[idx] * mul;
}
4160 | |
/**
 * Multiply src by a sequence of 2-element sub-vectors and a scalar:
 * each consecutive pair dst[i..i+1] uses the next pointer in sv.
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i, k;
    for (i = 0; i < len; i += 2, sv++)
        for (k = 0; k < 2; k++)
            dst[i+k] = src[i+k] * sv[0][k] * mul;
}
4170 | |
/**
 * Multiply src by a sequence of 4-element sub-vectors and a scalar:
 * each consecutive quad dst[i..i+3] uses the next pointer in sv.
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i, k;
    for (i = 0; i < len; i += 4, sv++)
        for (k = 0; k < 4; k++)
            dst[i+k] = src[i+k] * sv[0][k] * mul;
}
4182 | |
/** Expand a sequence of 2-element sub-vectors scaled by mul into dst. */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i, k;
    for (i = 0; i < len; i += 2, sv++)
        for (k = 0; k < 2; k++)
            dst[i+k] = sv[0][k] * mul;
}
4192 | |
/** Expand a sequence of 4-element sub-vectors scaled by mul into dst. */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i, k;
    for (i = 0; i < len; i += 4, sv++)
        for (k = 0; k < 4; k++)
            dst[i+k] = sv[0][k] * mul;
}
4204 | |
/** In-place butterfly: v1[i] <- v1[i]+v2[i], v2[i] <- old v1[i]-v2[i]. */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        const float diff = v1[i] - v2[i];   /* capture before overwriting v1 */
        v1[i] = v1[i] + v2[i];
        v2[i] = diff;
    }
}
4215 | |
/** Dot product of two float vectors. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int i;

    for (i = 0; i < len; i++)
        acc += v1[i] * v2[i];

    return acc;
}
4226 | |
/** Convert int samples to float while scaling: dst[i] = src[i] * mul. */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int n;
    for (n = 0; n < len; n++)
        dst[n] = src[n] * mul;
}
4232 | |
10104
0fa3d21b317e
SSE optimized vector_clipf(). 10% faster TwinVQ decoding.
vitor
parents:
10094
diff
changeset
|
/**
 * Clamp one float, viewed as its raw 32-bit pattern, for the case where the
 * clip range straddles zero (min < 0 < max); helper for
 * vector_clipf_c_opposite_sign(). mini holds the bits of min, maxi the bits
 * of max, and maxisign is maxi with its sign bit flipped (precomputed by the
 * caller). Returns the clamped bit pattern.
 *
 * NOTE: the original used (1<<31), which left-shifts into the sign bit of a
 * signed int — undefined behavior in C99 (6.5.7); 0x80000000U is the
 * well-defined equivalent and produces identical bits.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{

    if (a > mini)                              return mini; /* negative side: beyond min */
    else if ((a ^ 0x80000000U) > maxisign)     return maxi; /* positive side: beyond max */
    else                                       return a;
}
0fa3d21b317e
SSE optimized vector_clipf(). 10% faster TwinVQ decoding.
vitor
parents:
10094
diff
changeset
|
4241 |
/**
 * Clip a float vector to [*min, *max] when min < 0 < max, operating on the
 * raw bit patterns (strict-aliasing casts are deliberate here, matching the
 * SIMD versions). len is assumed to be a multiple of 8, as in the original
 * 8-way unrolled loop.
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i, k;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for (i = 0; i < len; i += 8)
        for (k = 0; k < 8; k++)
            dsti[i + k] = clipf_c_one(srci[i + k], mini, maxi, maxisign);
}
/**
 * Clip every element of src into [min, max] and store into dst.
 * len is assumed to be a multiple of 8 (the original was 8-way unrolled).
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if (min < 0 && max > 0) {
        /* range straddles zero: use the bit-pattern fast path */
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for (i = 0; i < len; i += 8) {
            int k;
            for (k = 0; k < 8; k++)
                dst[i + k] = av_clipf(src[i + k], min, max);
        }
    }
}
0fa3d21b317e
SSE optimized vector_clipf(). 10% faster TwinVQ decoding.
vitor
parents:
10094
diff
changeset
|
4277 |
7261 | 4278 static av_always_inline int float_to_int16_one(const float *src){ |
4279 int_fast32_t tmp = *(const int32_t*)src; | |
4280 if(tmp & 0xf0000){ | |
4281 tmp = (0x43c0ffff - tmp)>>31; | |
4282 // is this faster on some gcc/cpu combinations? | |
4283 // if(tmp > 0x43c0ffff) tmp = 0xFFFF; | |
4284 // else tmp = 0; | |
4285 } | |
4286 return tmp - 0x8000; | |
4287 } | |
4288 | |
/** Convert a vector of pre-biased floats to int16 samples. */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int i;
    for (i = 0; i < len; i++)
        dst[i] = float_to_int16_one(src + i);
}
4294 | |
7286
e267f2519248
float_to_int16_interleave: change src to an array of pointers instead of assuming it's contiguous.
lorenm
parents:
7263
diff
changeset
|
/**
 * Convert per-channel float buffers to interleaved int16 samples.
 * src is an array of 'channels' pointers, each to 'len' floats.
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int i, j, c;
    if (channels == 2) {
        /* common stereo case, interleaved pairwise */
        for (i = 0; i < len; i++) {
            dst[2*i  ] = float_to_int16_one(src[0] + i);
            dst[2*i+1] = float_to_int16_one(src[1] + i);
        }
    } else {
        for (c = 0; c < channels; c++)
            for (i = 0, j = c; i < len; i++, j += channels)
                dst[j] = float_to_int16_one(src[c] + i);
    }
}
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4308 |
7203
87b1dfb5a98d
Add several vector functions used by Monkey's Audio decoder to dsputil
kostya
parents:
6719
diff
changeset
|
/** Dot product of two int16 vectors, with each partial product shifted right. */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int i;
    int res = 0;

    for (i = 0; i < order; i++)
        res += (v1[i] * v2[i]) >> shift;

    return res;
}
87b1dfb5a98d
Add several vector functions used by Monkey's Audio decoder to dsputil
kostya
parents:
6719
diff
changeset
|
4318 |
/**
 * Combined dot product and multiply-accumulate:
 * returns sum(v1[i]*v2[i]) using the ORIGINAL v1 values, while updating
 * v1[i] += mul * v3[i] in place.
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
{
    int i;
    int res = 0;
    for (i = 0; i < order; i++) {
        res   += v1[i] * v2[i];     /* read v1 before it is modified */
        v1[i] += mul * v3[i];
    }
    return res;
}
4328 | |
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/** One horizontal pass of the WMV2 8x8 inverse DCT, in place on b[0..7]. */
static void wmv2_idct_row(short * b)
{
    int s1, s2;
    int a0, a1, a2, a3, a4, a5, a6, a7;
    /* step 1: butterflies of the cosine-scaled inputs */
    a1 = W1*b[1] + W7*b[7];
    a7 = W7*b[1] - W1*b[7];
    a5 = W5*b[5] + W3*b[3];
    a3 = W3*b[5] - W5*b[3];
    a2 = W2*b[2] + W6*b[6];
    a6 = W6*b[2] - W2*b[6];
    a0 = W0*b[0] + W0*b[4];
    a4 = W0*b[0] - W0*b[4];
    /* step 2: rotate the odd terms (181/256 ~ 1/sqrt(2)) */
    s1 = (181*(a1-a5+a7-a3) + 128) >> 8; //1,3,5,7,
    s2 = (181*(a1-a5-a7+a3) + 128) >> 8;
    /* step 3: output butterflies with rounding */
    b[0] = (a0+a2+a1+a5 + (1<<7)) >> 8;
    b[1] = (a4+a6 +s1   + (1<<7)) >> 8;
    b[2] = (a4-a6 +s2   + (1<<7)) >> 8;
    b[3] = (a0-a2+a7+a3 + (1<<7)) >> 8;
    b[4] = (a0-a2-a7-a3 + (1<<7)) >> 8;
    b[5] = (a4-a6 -s2   + (1<<7)) >> 8;
    b[6] = (a4+a6 -s1   + (1<<7)) >> 8;
    b[7] = (a0+a2-a1-a5 + (1<<7)) >> 8;
}
4364 static void wmv2_idct_col(short * b) | |
4365 { | |
4366 int s1,s2; | |
4367 int a0,a1,a2,a3,a4,a5,a6,a7; | |
4368 /*step 1, with extended precision*/ | |
4369 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3; | |
4370 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3; | |
4371 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3; | |
4372 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3; | |
4373 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3; | |
4374 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3; | |
4375 a0 = (W0*b[8*0]+W0*b[8*4] )>>3; | |
4376 a4 = (W0*b[8*0]-W0*b[8*4] )>>3; | |
4377 /*step 2*/ | |
4378 s1 = (181*(a1-a5+a7-a3)+128)>>8; | |
4379 s2 = (181*(a1-a5-a7+a3)+128)>>8; | |
4380 /*step 3*/ | |
4381 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14; | |
4382 b[8*1] = (a4+a6 +s1 + (1<<13))>>14; | |
4383 b[8*2] = (a4-a6 +s2 + (1<<13))>>14; | |
4384 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14; | |
4385 | |
4386 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14; | |
4387 b[8*5] = (a4-a6 -s2 + (1<<13))>>14; | |
4388 b[8*6] = (a4+a6 -s1 + (1<<13))>>14; | |
4389 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14; | |
4390 } | |
/** Full 8x8 WMV2 inverse DCT: 8 row passes followed by 8 column passes. */
void ff_wmv2_idct_c(short * block){
    int i;

    for (i = 0; i < 64; i += 8)
        wmv2_idct_row(block + i);
    for (i = 0; i < 8; i++)
        wmv2_idct_col(block + i);
}
1092 | 4401 /* XXX: those functions should be suppressed ASAP when all IDCTs are |
4402 converted */ | |
5887 | 4403 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block) |
4404 { | |
4405 ff_wmv2_idct_c(block); | |
4406 put_pixels_clamped_c(block, dest, line_size); | |
4407 } | |
4408 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block) | |
4409 { | |
4410 ff_wmv2_idct_c(block); | |
4411 add_pixels_clamped_c(block, dest, line_size); | |
4412 } | |
1092 | 4413 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block) |
4414 { | |
4415 j_rev_dct (block); | |
4416 put_pixels_clamped_c(block, dest, line_size); | |
4417 } | |
4418 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block) | |
4419 { | |
4420 j_rev_dct (block); | |
4421 add_pixels_clamped_c(block, dest, line_size); | |
4422 } | |
4423 | |
2256 | 4424 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block) |
4425 { | |
4426 j_rev_dct4 (block); | |
4427 put_pixels_clamped4_c(block, dest, line_size); | |
4428 } | |
4429 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block) | |
4430 { | |
4431 j_rev_dct4 (block); | |
4432 add_pixels_clamped4_c(block, dest, line_size); | |
4433 } | |
4434 | |
2257 | 4435 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block) |
4436 { | |
4437 j_rev_dct2 (block); | |
4438 put_pixels_clamped2_c(block, dest, line_size); | |
4439 } | |
4440 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block) | |
4441 { | |
4442 j_rev_dct2 (block); | |
4443 add_pixels_clamped2_c(block, dest, line_size); | |
4444 } | |
4445 | |
2259 | 4446 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block) |
4447 { | |
4176 | 4448 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2259 | 4449 |
4450 dest[0] = cm[(block[0] + 4)>>3]; | |
4451 } | |
4452 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block) | |
4453 { | |
4176 | 4454 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2259 | 4455 |
4456 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)]; | |
4457 } | |
4458 | |
5143 | 4459 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; } |
3215
06f98047ff26
prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents:
3199
diff
changeset
|
4460 |
1201 | 4461 /* init static data */ |
10867 | 4462 av_cold void dsputil_static_init(void) |
0 | 4463 { |
751 | 4464 int i; |
0 | 4465 |
4176 | 4466 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i; |
1201 | 4467 for(i=0;i<MAX_NEG_CROP;i++) { |
4176 | 4468 ff_cropTbl[i] = 0; |
4469 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255; | |
1201 | 4470 } |
2967 | 4471 |
1201 | 4472 for(i=0;i<512;i++) { |
4179 | 4473 ff_squareTbl[i] = (i - 256) * (i - 256); |
1201 | 4474 } |
2967 | 4475 |
4197 | 4476 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1; |
1201 | 4477 } |
0 | 4478 |
4281
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4479 int ff_check_alignment(void){ |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4480 static int did_fail=0; |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4481 DECLARE_ALIGNED_16(int, aligned); |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4482 |
9259 | 4483 if((intptr_t)&aligned & 15){ |
4281
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4484 if(!did_fail){ |
8590 | 4485 #if HAVE_MMX || HAVE_ALTIVEC |
4281
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4486 av_log(NULL, AV_LOG_ERROR, |
4292 | 4487 "Compiler did not align stack variables. Libavcodec has been miscompiled\n" |
4488 "and may be very slow or crash. This is not a bug in libavcodec,\n" | |
5542
b0a566346fb1
Add attribute that forces alignment of stack to functions that need it.
ramiro
parents:
5520
diff
changeset
|
4489 "but in the compiler. You may try recompiling using gcc >= 4.2.\n" |
b0a566346fb1
Add attribute that forces alignment of stack to functions that need it.
ramiro
parents:
5520
diff
changeset
|
4490 "Do not report crashes to FFmpeg developers.\n"); |
4281
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4491 #endif |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4492 did_fail=1; |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4493 } |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4494 return -1; |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4495 } |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4496 return 0; |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4497 } |
861 | 4498 |
10867 | 4499 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) |
1201 | 4500 { |
4501 int i; | |
0 | 4502 |
4281
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4503 ff_check_alignment(); |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4504 |
8590 | 4505 #if CONFIG_ENCODERS |
1567 | 4506 if(avctx->dct_algo==FF_DCT_FASTINT) { |
1092 | 4507 c->fdct = fdct_ifast; |
2979 | 4508 c->fdct248 = fdct_ifast248; |
2967 | 4509 } |
1567 | 4510 else if(avctx->dct_algo==FF_DCT_FAAN) { |
1557 | 4511 c->fdct = ff_faandct; |
2979 | 4512 c->fdct248 = ff_faandct248; |
2967 | 4513 } |
1567 | 4514 else { |
1092 | 4515 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default |
2979 | 4516 c->fdct248 = ff_fdct248_islow; |
1567 | 4517 } |
1092 | 4518 #endif //CONFIG_ENCODERS |
4519 | |
2256 | 4520 if(avctx->lowres==1){ |
8596
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4521 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){ |
2272
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4522 c->idct_put= ff_jref_idct4_put; |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4523 c->idct_add= ff_jref_idct4_add; |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4524 }else{ |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4525 c->idct_put= ff_h264_lowres_idct_put_c; |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4526 c->idct_add= ff_h264_lowres_idct_add_c; |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4527 } |
2256 | 4528 c->idct = j_rev_dct4; |
1092 | 4529 c->idct_permutation_type= FF_NO_IDCT_PERM; |
2257 | 4530 }else if(avctx->lowres==2){ |
4531 c->idct_put= ff_jref_idct2_put; | |
4532 c->idct_add= ff_jref_idct2_add; | |
4533 c->idct = j_rev_dct2; | |
4534 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
2259 | 4535 }else if(avctx->lowres==3){ |
4536 c->idct_put= ff_jref_idct1_put; | |
4537 c->idct_add= ff_jref_idct1_add; | |
4538 c->idct = j_rev_dct1; | |
4539 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
2256 | 4540 }else{ |
4541 if(avctx->idct_algo==FF_IDCT_INT){ | |
4542 c->idct_put= ff_jref_idct_put; | |
4543 c->idct_add= ff_jref_idct_add; | |
4544 c->idct = j_rev_dct; | |
4545 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; | |
9975
d6d7e8d4a04d
Do not redundantly check for both CONFIG_THEORA_DECODER and CONFIG_VP3_DECODER.
diego
parents:
9586
diff
changeset
|
4546 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) && |
5007 | 4547 avctx->idct_algo==FF_IDCT_VP3){ |
2693 | 4548 c->idct_put= ff_vp3_idct_put_c; |
4549 c->idct_add= ff_vp3_idct_add_c; | |
4550 c->idct = ff_vp3_idct_c; | |
4551 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
5887 | 4552 }else if(avctx->idct_algo==FF_IDCT_WMV2){ |
4553 c->idct_put= ff_wmv2_idct_put_c; | |
4554 c->idct_add= ff_wmv2_idct_add_c; | |
4555 c->idct = ff_wmv2_idct_c; | |
4556 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
6407 | 4557 }else if(avctx->idct_algo==FF_IDCT_FAAN){ |
4558 c->idct_put= ff_faanidct_put; | |
4559 c->idct_add= ff_faanidct_add; | |
4560 c->idct = ff_faanidct; | |
4561 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
8596
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4562 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) { |
8120 | 4563 c->idct_put= ff_ea_idct_put_c; |
4564 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
2256 | 4565 }else{ //accurate/default |
6001 | 4566 c->idct_put= ff_simple_idct_put; |
4567 c->idct_add= ff_simple_idct_add; | |
4568 c->idct = ff_simple_idct; | |
2256 | 4569 c->idct_permutation_type= FF_NO_IDCT_PERM; |
4570 } | |
1092 | 4571 } |
4572 | |
8596
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4573 if (CONFIG_H264_DECODER) { |
5065 | 4574 c->h264_idct_add= ff_h264_idct_add_c; |
4575 c->h264_idct8_add= ff_h264_idct8_add_c; | |
4576 c->h264_idct_dc_add= ff_h264_idct_dc_add_c; | |
4577 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c; | |
8375
de2509cf3c44
H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents:
8359
diff
changeset
|
4578 c->h264_idct_add16 = ff_h264_idct_add16_c; |
de2509cf3c44
H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents:
8359
diff
changeset
|
4579 c->h264_idct8_add4 = ff_h264_idct8_add4_c; |
de2509cf3c44
H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents:
8359
diff
changeset
|
4580 c->h264_idct_add8 = ff_h264_idct_add8_c; |
de2509cf3c44
H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents:
8359
diff
changeset
|
4581 c->h264_idct_add16intra= ff_h264_idct_add16intra_c; |
5064 | 4582 } |
2272
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4583 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4584 c->get_pixels = get_pixels_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4585 c->diff_pixels = diff_pixels_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4586 c->put_pixels_clamped = put_pixels_clamped_c; |
1984
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
4587 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c; |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4588 c->add_pixels_clamped = add_pixels_clamped_c; |
2763 | 4589 c->add_pixels8 = add_pixels8_c; |
4590 c->add_pixels4 = add_pixels4_c; | |
4988
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
4591 c->sum_abs_dctelem = sum_abs_dctelem_c; |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4592 c->gmc1 = gmc1_c; |
3248
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3245
diff
changeset
|
4593 c->gmc = ff_gmc_c; |
8288 | 4594 c->clear_block = clear_block_c; |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4595 c->clear_blocks = clear_blocks_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4596 c->pix_sum = pix_sum_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4597 c->pix_norm1 = pix_norm1_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4598 |
859 | 4599 /* TODO [0] 16 [1] 8 */ |
1708 | 4600 c->pix_abs[0][0] = pix_abs16_c; |
4601 c->pix_abs[0][1] = pix_abs16_x2_c; | |
4602 c->pix_abs[0][2] = pix_abs16_y2_c; | |
4603 c->pix_abs[0][3] = pix_abs16_xy2_c; | |
4604 c->pix_abs[1][0] = pix_abs8_c; | |
4605 c->pix_abs[1][1] = pix_abs8_x2_c; | |
4606 c->pix_abs[1][2] = pix_abs8_y2_c; | |
4607 c->pix_abs[1][3] = pix_abs8_xy2_c; | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4608 |
859 | 4609 #define dspfunc(PFX, IDX, NUM) \ |
4610 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \ | |
4611 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \ | |
4612 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \ | |
4613 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4614 |
859 | 4615 dspfunc(put, 0, 16); |
4616 dspfunc(put_no_rnd, 0, 16); | |
4617 dspfunc(put, 1, 8); | |
4618 dspfunc(put_no_rnd, 1, 8); | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4619 dspfunc(put, 2, 4); |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4620 dspfunc(put, 3, 2); |
0 | 4621 |
859 | 4622 dspfunc(avg, 0, 16); |
4623 dspfunc(avg_no_rnd, 0, 16); | |
4624 dspfunc(avg, 1, 8); | |
4625 dspfunc(avg_no_rnd, 1, 8); | |
1319 | 4626 dspfunc(avg, 2, 4); |
4627 dspfunc(avg, 3, 2); | |
859 | 4628 #undef dspfunc |
857 | 4629 |
1864 | 4630 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c; |
4631 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c; | |
4632 | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4633 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4634 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4635 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4636 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4637 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4638 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4639 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4640 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4641 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4642 |
1319 | 4643 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c; |
4644 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c; | |
4645 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c; | |
4646 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c; | |
4647 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c; | |
4648 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c; | |
4649 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c; | |
4650 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c; | |
4651 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c; | |
4652 | |
859 | 4653 #define dspfunc(PFX, IDX, NUM) \ |
4654 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \ | |
4655 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \ | |
4656 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \ | |
4657 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \ | |
4658 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \ | |
4659 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \ | |
4660 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \ | |
4661 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \ | |
4662 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \ | |
4663 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \ | |
4664 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \ | |
4665 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \ | |
4666 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \ | |
4667 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \ | |
4668 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \ | |
4669 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c | |
857 | 4670 |
859 | 4671 dspfunc(put_qpel, 0, 16); |
4672 dspfunc(put_no_rnd_qpel, 0, 16); | |
4673 | |
4674 dspfunc(avg_qpel, 0, 16); | |
4675 /* dspfunc(avg_no_rnd_qpel, 0, 16); */ | |
857 | 4676 |
859 | 4677 dspfunc(put_qpel, 1, 8); |
4678 dspfunc(put_no_rnd_qpel, 1, 8); | |
4679 | |
4680 dspfunc(avg_qpel, 1, 8); | |
4681 /* dspfunc(avg_no_rnd_qpel, 1, 8); */ | |
1168 | 4682 |
4683 dspfunc(put_h264_qpel, 0, 16); | |
4684 dspfunc(put_h264_qpel, 1, 8); | |
4685 dspfunc(put_h264_qpel, 2, 4); | |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
4686 dspfunc(put_h264_qpel, 3, 2); |
1168 | 4687 dspfunc(avg_h264_qpel, 0, 16); |
4688 dspfunc(avg_h264_qpel, 1, 8); | |
4689 dspfunc(avg_h264_qpel, 2, 4); | |
4690 | |
859 | 4691 #undef dspfunc |
1168 | 4692 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c; |
4693 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c; | |
4694 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c; | |
4695 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c; | |
4696 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c; | |
4697 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c; | |
9439
ef3a7b711cc0
Rename put_no_rnd_h264_chroma* to reflect its usage in VC1 only
conrad
parents:
9437
diff
changeset
|
4698 c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c; |
9440 | 4699 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c; |
857 | 4700 |
2415 | 4701 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c; |
4702 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c; | |
4703 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c; | |
4704 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c; | |
4705 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c; | |
4706 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c; | |
4707 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c; | |
4708 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c; | |
4709 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c; | |
4710 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c; | |
4711 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c; | |
4712 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c; | |
4713 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c; | |
4714 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c; | |
4715 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c; | |
4716 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c; | |
4717 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c; | |
4718 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c; | |
4719 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c; | |
4720 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c; | |
4721 | |
6437 | 4722 c->draw_edges = draw_edges_c; |
4723 | |
8590 | 4724 #if CONFIG_CAVS_DECODER |
3395
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
4725 ff_cavsdsp_init(c,avctx); |
3432 | 4726 #endif |
9585 | 4727 |
4728 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER | |
4729 ff_mlp_init(c, avctx); | |
4730 #endif | |
9995
3141f69e3905
Do not check for both CONFIG_VC1_DECODER and CONFIG_WMV3_DECODER,
diego
parents:
9975
diff
changeset
|
4731 #if CONFIG_VC1_DECODER |
3526 | 4732 ff_vc1dsp_init(c,avctx); |
4733 #endif | |
9995
3141f69e3905
Do not check for both CONFIG_VC1_DECODER and CONFIG_WMV3_DECODER,
diego
parents:
9975
diff
changeset
|
4734 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER |
5887 | 4735 ff_intrax8dsp_init(c,avctx); |
4736 #endif | |
8590 | 4737 #if CONFIG_RV30_DECODER |
8410 | 4738 ff_rv30dsp_init(c,avctx); |
4739 #endif | |
8590 | 4740 #if CONFIG_RV40_DECODER |
8232 | 4741 ff_rv40dsp_init(c,avctx); |
4742 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c; | |
4743 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c; | |
4744 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c; | |
4745 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c; | |
4746 #endif | |
3395
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
4747 |
936 | 4748 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c; |
4749 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c; | |
4750 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c; | |
4751 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c; | |
4752 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c; | |
4753 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c; | |
4754 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c; | |
4755 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c; | |
2967 | 4756 |
1708 | 4757 #define SET_CMP_FUNC(name) \ |
4758 c->name[0]= name ## 16_c;\ | |
4759 c->name[1]= name ## 8x8_c; | |
2967 | 4760 |
1708 | 4761 SET_CMP_FUNC(hadamard8_diff) |
1729 | 4762 c->hadamard8_diff[4]= hadamard8_intra16_c; |
8978 | 4763 c->hadamard8_diff[5]= hadamard8_intra8x8_c; |
1708 | 4764 SET_CMP_FUNC(dct_sad) |
2382 | 4765 SET_CMP_FUNC(dct_max) |
8590 | 4766 #if CONFIG_GPL |
3010
533c6386eca9
8x8 integer dct from x264 as cmp function (under CONFIG_GPL)
michael
parents:
2979
diff
changeset
|
4767 SET_CMP_FUNC(dct264_sad) |
3013 | 4768 #endif |
1708 | 4769 c->sad[0]= pix_abs16_c; |
4770 c->sad[1]= pix_abs8_c; | |
4771 c->sse[0]= sse16_c; | |
4772 c->sse[1]= sse8_c; | |
2184 | 4773 c->sse[2]= sse4_c; |
1708 | 4774 SET_CMP_FUNC(quant_psnr) |
4775 SET_CMP_FUNC(rd) | |
4776 SET_CMP_FUNC(bit) | |
1729 | 4777 c->vsad[0]= vsad16_c; |
4778 c->vsad[4]= vsad_intra16_c; | |
8978 | 4779 c->vsad[5]= vsad_intra8_c; |
1729 | 4780 c->vsse[0]= vsse16_c; |
4781 c->vsse[4]= vsse_intra16_c; | |
8978 | 4782 c->vsse[5]= vsse_intra8_c; |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
4783 c->nsse[0]= nsse16_c; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
4784 c->nsse[1]= nsse8_c; |
8590 | 4785 #if CONFIG_SNOW_ENCODER |
2184 | 4786 c->w53[0]= w53_16_c; |
4787 c->w53[1]= w53_8_c; | |
4788 c->w97[0]= w97_16_c; | |
4789 c->w97[1]= w97_8_c; | |
3373
b8996cc5ccae
Disable w53 and w97 cmp methods when snow encoder is disabled
gpoirier
parents:
3323
diff
changeset
|
4790 #endif |
2184 | 4791 |
4749 | 4792 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c; |
4793 | |
866 | 4794 c->add_bytes= add_bytes_c; |
6384 | 4795 c->add_bytes_l2= add_bytes_l2_c; |
866 | 4796 c->diff_bytes= diff_bytes_c; |
8760 | 4797 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c; |
1527 | 4798 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c; |
10370 | 4799 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c; |
4800 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c; | |
1273 | 4801 c->bswap_buf= bswap_buf; |
8590 | 4802 #if CONFIG_PNG_DECODER |
6384 | 4803 c->add_png_paeth_prediction= ff_add_png_paeth_prediction; |
4804 #endif | |
2633 | 4805 |
4806 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c; | |
4807 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c; | |
8395
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
4808 c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
4809 c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c; |
2633 | 4810 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c; |
4811 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c; | |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
4812 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c; |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
4813 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c; |
3645
47821be55b6c
mmx implementation of deblocking strength decision.
lorenm
parents:
3568
diff
changeset
|
4814 c->h264_loop_filter_strength= NULL; |
2967 | 4815 |
10749
5cca4b6c459d
Get rid of pointless CONFIG_ANY_H263 preprocessor definition.
diego
parents:
10748
diff
changeset
|
4816 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { |
5278 | 4817 c->h263_h_loop_filter= h263_h_loop_filter_c; |
4818 c->h263_v_loop_filter= h263_v_loop_filter_c; | |
5277
7b3fcb7c61ce
Avoid linking with h263.c functions when the relevant codecs
aurel
parents:
5256
diff
changeset
|
4819 } |
2967 | 4820 |
9975
d6d7e8d4a04d
Do not redundantly check for both CONFIG_THEORA_DECODER and CONFIG_VP3_DECODER.
diego
parents:
9586
diff
changeset
|
4821 if (CONFIG_VP3_DECODER) { |
7995 | 4822 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c; |
4823 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c; | |
4824 } | |
8785
bee83b3f9a6b
move vp6_filter_diag4() to a new vp6dsp.c file and use it throught dsputil
aurel
parents:
8760
diff
changeset
|
4825 if (CONFIG_VP6_DECODER) { |
bee83b3f9a6b
move vp6_filter_diag4() to a new vp6dsp.c file and use it throught dsputil
aurel
parents:
8760
diff
changeset
|
4826 c->vp6_filter_diag4= ff_vp6_filter_diag4_c; |
bee83b3f9a6b
move vp6_filter_diag4() to a new vp6dsp.c file and use it throught dsputil
aurel
parents:
8760
diff
changeset
|
4827 } |
7995 | 4828 |
2045 | 4829 c->h261_loop_filter= h261_loop_filter_c; |
2967 | 4830 |
1784 | 4831 c->try_8x8basis= try_8x8basis_c; |
4832 c->add_8x8basis= add_8x8basis_c; | |
866 | 4833 |
8590 | 4834 #if CONFIG_SNOW_DECODER |
3198
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
4835 c->vertical_compose97i = ff_snow_vertical_compose97i; |
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
4836 c->horizontal_compose97i = ff_snow_horizontal_compose97i; |
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
4837 c->inner_add_yblock = ff_snow_inner_add_yblock; |
3199
1651e69b9f7a
10l: Only set *compose97i *add_yblock to dsputils context if we are building with Snow enabled
gpoirier
parents:
3198
diff
changeset
|
4838 #endif |
3198
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
4839 |
8590 | 4840 #if CONFIG_VORBIS_DECODER |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
4841 c->vorbis_inverse_coupling = vorbis_inverse_coupling; |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
4842 #endif |
8590 | 4843 #if CONFIG_AC3_DECODER |
7563 | 4844 c->ac3_downmix = ff_ac3_downmix_c; |
4845 #endif | |
10429
289dd8daf4ee
add CONFIG_LPC to the build system for lpc dsputil functions. fixes build
jbr
parents:
10424
diff
changeset
|
4846 #if CONFIG_LPC |
10424
94595d0e617c
Move autocorrelation function from flacenc.c to lpc.c. Also rename the
jbr
parents:
10421
diff
changeset
|
4847 c->lpc_compute_autocorr = ff_lpc_compute_autocorr; |
10429
289dd8daf4ee
add CONFIG_LPC to the build system for lpc dsputil functions. fixes build
jbr
parents:
10424
diff
changeset
|
4848 #endif |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4849 c->vector_fmul = vector_fmul_c; |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4850 c->vector_fmul_reverse = vector_fmul_reverse_c; |
10300
4d1b9ca628fc
Drop unused args from vector_fmul_add_add, simpify code, and rename
mru
parents:
10219
diff
changeset
|
4851 c->vector_fmul_add = vector_fmul_add_c; |
7261 | 4852 c->vector_fmul_window = ff_vector_fmul_window_c; |
7564 | 4853 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; |
10104
0fa3d21b317e
SSE optimized vector_clipf(). 10% faster TwinVQ decoding.
vitor
parents:
10094
diff
changeset
|
4854 c->vector_clipf = vector_clipf_c; |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4855 c->float_to_int16 = ff_float_to_int16_c; |
7261 | 4856 c->float_to_int16_interleave = ff_float_to_int16_interleave_c; |
7203
87b1dfb5a98d
Add several vector functions used by Monkey's Audio decoder to dsputil
kostya
parents:
6719
diff
changeset
|
4857 c->scalarproduct_int16 = scalarproduct_int16_c; |
10644 | 4858 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c; |
10219 | 4859 c->scalarproduct_float = scalarproduct_float_c; |
4860 c->butterflies_float = butterflies_float_c; | |
4861 c->vector_fmul_scalar = vector_fmul_scalar_c; | |
4862 | |
4863 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c; | |
4864 c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c; | |
4865 | |
4866 c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c; | |
4867 c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c; | |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
4868 |
3245 | 4869 c->shrink[0]= ff_img_copy_plane; |
4870 c->shrink[1]= ff_shrink22; | |
4871 c->shrink[2]= ff_shrink44; | |
4872 c->shrink[3]= ff_shrink88; | |
4873 | |
3215
06f98047ff26
prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents:
3199
diff
changeset
|
4874 c->prefetch= just_return; |
06f98047ff26
prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents:
3199
diff
changeset
|
4875 |
3807
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4876 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab)); |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4877 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab)); |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4878 |
8596
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4879 if (HAVE_MMX) dsputil_init_mmx (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4880 if (ARCH_ARM) dsputil_init_arm (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4881 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4882 if (HAVE_VIS) dsputil_init_vis (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4883 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4884 if (ARCH_PPC) dsputil_init_ppc (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4885 if (HAVE_MMI) dsputil_init_mmi (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4886 if (ARCH_SH4) dsputil_init_sh4 (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4887 if (ARCH_BFIN) dsputil_init_bfin (c, avctx); |
1092 | 4888 |
3807
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4889 for(i=0; i<64; i++){ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4890 if(!c->put_2tap_qpel_pixels_tab[0][i]) |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4891 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i]; |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4892 if(!c->avg_2tap_qpel_pixels_tab[0][i]) |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4893 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i]; |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4894 } |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4895 |
1092 | 4896 switch(c->idct_permutation_type){ |
4897 case FF_NO_IDCT_PERM: | |
4898 for(i=0; i<64; i++) | |
4899 c->idct_permutation[i]= i; | |
4900 break; | |
4901 case FF_LIBMPEG2_IDCT_PERM: | |
4902 for(i=0; i<64; i++) | |
4903 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2); | |
4904 break; | |
4905 case FF_SIMPLE_IDCT_PERM: | |
4906 for(i=0; i<64; i++) | |
4907 c->idct_permutation[i]= simple_mmx_permutation[i]; | |
4908 break; | |
4909 case FF_TRANSPOSE_IDCT_PERM: | |
4910 for(i=0; i<64; i++) | |
4911 c->idct_permutation[i]= ((i&7)<<3) | (i>>3); | |
4912 break; | |
2696
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2693
diff
changeset
|
4913 case FF_PARTTRANS_IDCT_PERM: |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2693
diff
changeset
|
4914 for(i=0; i<64; i++) |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2693
diff
changeset
|
4915 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3); |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2693
diff
changeset
|
4916 break; |
6600
c3213c91124c
Add a new IDCT permutation, used in xvid_sse2 and possibly future similar IDCTs.
astrange
parents:
6450
diff
changeset
|
4917 case FF_SSE2_IDCT_PERM: |
c3213c91124c
Add a new IDCT permutation, used in xvid_sse2 and possibly future similar IDCTs.
astrange
parents:
6450
diff
changeset
|
4918 for(i=0; i<64; i++) |
c3213c91124c
Add a new IDCT permutation, used in xvid_sse2 and possibly future similar IDCTs.
astrange
parents:
6450
diff
changeset
|
4919 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7]; |
c3213c91124c
Add a new IDCT permutation, used in xvid_sse2 and possibly future similar IDCTs.
astrange
parents:
6450
diff
changeset
|
4920 break; |
1092 | 4921 default: |
1598
932d306bf1dc
av_log() patch by (Michel Bardiaux <mbardiaux at peaktime dot be>)
michael
parents:
1571
diff
changeset
|
4922 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n"); |
1092 | 4923 } |
0 | 4924 } |
252
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
220
diff
changeset
|
4925 |