Mercurial > libavcodec.hg
annotate dsputil.c @ 11032:01bd040f8607 libavcodec
Unroll main loop so the edge==0 case is separate.
This allows many things to be simplified away.
h264 decoder is overall 1% faster with a mbaff sample and
0.1% slower with the cathedral sample, probably because the slow loop
filter code must be loaded into the code cache for each first MB of each
row but isn't used for the following MBs.
author | michael |
---|---|
date | Thu, 28 Jan 2010 01:24:25 +0000 |
parents | d27deb92257b |
children | e5ebf3a17d9d |
rev | line source |
---|---|
0 | 1 /* |
2 * DSP utils | |
8629
04423b2f6e0b
cosmetics: Remove pointless period after copyright statement non-sentences.
diego
parents:
8627
diff
changeset
|
3 * Copyright (c) 2000, 2001 Fabrice Bellard |
1739
07a484280a82
copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise
michael
parents:
1729
diff
changeset
|
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
0 | 5 * |
5214 | 6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at> |
7 * | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
8 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
9 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
10 * FFmpeg is free software; you can redistribute it and/or |
429 | 11 * modify it under the terms of the GNU Lesser General Public |
12 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
13 * version 2.1 of the License, or (at your option) any later version. |
0 | 14 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
15 * FFmpeg is distributed in the hope that it will be useful, |
0 | 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
18 * Lesser General Public License for more details. | |
0 | 19 * |
429 | 20 * You should have received a copy of the GNU Lesser General Public |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
21 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
3029
diff
changeset
|
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
0 | 23 */ |
2967 | 24 |
1106 | 25 /** |
8718
e9d9d946f213
Use full internal pathname in doxygen @file directives.
diego
parents:
8629
diff
changeset
|
26 * @file libavcodec/dsputil.c |
1106 | 27 * DSP utils |
28 */ | |
2967 | 29 |
0 | 30 #include "avcodec.h" |
31 #include "dsputil.h" | |
1092 | 32 #include "simple_idct.h" |
1557 | 33 #include "faandct.h" |
6407 | 34 #include "faanidct.h" |
8627
d6bab465b82c
moves mid_pred() into mathops.h (with arch specific code split by directory)
aurel
parents:
8596
diff
changeset
|
35 #include "mathops.h" |
3198
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
36 #include "snow.h" |
10748
36611425fedb
Add required header #includes for mpegvideo.h and config.h.
diego
parents:
10644
diff
changeset
|
37 #include "mpegvideo.h" |
36611425fedb
Add required header #includes for mpegvideo.h and config.h.
diego
parents:
10644
diff
changeset
|
38 #include "config.h" |
676 | 39 |
2522
e25782262d7d
kill warnings patch by (Mns Rullgrd <mru inprovide com>)
michael
parents:
2448
diff
changeset
|
40 /* snow.c */ |
e25782262d7d
kill warnings patch by (Mns Rullgrd <mru inprovide com>)
michael
parents:
2448
diff
changeset
|
41 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count); |
e25782262d7d
kill warnings patch by (Mns Rullgrd <mru inprovide com>)
michael
parents:
2448
diff
changeset
|
42 |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
43 /* vorbis.c */ |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
44 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize); |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
45 |
7563 | 46 /* ac3dec.c */ |
47 void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len); | |
48 | |
10424
94595d0e617c
Move autocorrelation function from flacenc.c to lpc.c. Also rename the
jbr
parents:
10421
diff
changeset
|
49 /* lpc.c */ |
94595d0e617c
Move autocorrelation function from flacenc.c to lpc.c. Also rename the
jbr
parents:
10421
diff
changeset
|
50 void ff_lpc_compute_autocorr(const int32_t *data, int len, int lag, double *autoc); |
5737 | 51 |
6384 | 52 /* pngdec.c */ |
53 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp); | |
54 | |
8120 | 55 /* eaidct.c */ |
56 void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block); | |
57 | |
4176 | 58 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, }; |
4179 | 59 uint32_t ff_squareTbl[512] = {0, }; |
0 | 60 |
6387 | 61 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size |
62 #define pb_7f (~0UL/255 * 0x7f) | |
63 #define pb_80 (~0UL/255 * 0x80) | |
6385 | 64 |
1064 | 65 const uint8_t ff_zigzag_direct[64] = { |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
689
diff
changeset
|
66 0, 1, 8, 16, 9, 2, 3, 10, |
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
689
diff
changeset
|
67 17, 24, 32, 25, 18, 11, 4, 5, |
34 | 68 12, 19, 26, 33, 40, 48, 41, 34, |
706
e65798d228ea
idct permutation cleanup, idct can be selected per context now
michaelni
parents:
689
diff
changeset
|
69 27, 20, 13, 6, 7, 14, 21, 28, |
34 | 70 35, 42, 49, 56, 57, 50, 43, 36, |
71 29, 22, 15, 23, 30, 37, 44, 51, | |
72 58, 59, 52, 45, 38, 31, 39, 46, | |
73 53, 60, 61, 54, 47, 55, 62, 63 | |
74 }; | |
75 | |
1567 | 76 /* Specific zigzag scan for 248 idct. NOTE that unlike the |
77 specification, we interleave the fields */ | |
78 const uint8_t ff_zigzag248_direct[64] = { | |
79 0, 8, 1, 9, 16, 24, 2, 10, | |
80 17, 25, 32, 40, 48, 56, 33, 41, | |
81 18, 26, 3, 11, 4, 12, 19, 27, | |
82 34, 42, 49, 57, 50, 58, 35, 43, | |
83 20, 28, 5, 13, 6, 14, 21, 29, | |
84 36, 44, 51, 59, 52, 60, 37, 45, | |
85 22, 30, 7, 15, 23, 31, 38, 46, | |
86 53, 61, 54, 62, 39, 47, 55, 63, | |
87 }; | |
88 | |
220 | 89 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */ |
10965
d27deb92257b
The SSSE3 version of dct_quantize in mpegvideo_mmx_template.c needs
reimar
parents:
10961
diff
changeset
|
90 DECLARE_ALIGNED_16(uint16_t, inv_zigzag_direct16)[64]; |
220 | 91 |
1064 | 92 const uint8_t ff_alternate_horizontal_scan[64] = { |
2967 | 93 0, 1, 2, 3, 8, 9, 16, 17, |
34 | 94 10, 11, 4, 5, 6, 7, 15, 14, |
2967 | 95 13, 12, 19, 18, 24, 25, 32, 33, |
34 | 96 26, 27, 20, 21, 22, 23, 28, 29, |
2967 | 97 30, 31, 34, 35, 40, 41, 48, 49, |
34 | 98 42, 43, 36, 37, 38, 39, 44, 45, |
2967 | 99 46, 47, 50, 51, 56, 57, 58, 59, |
34 | 100 52, 53, 54, 55, 60, 61, 62, 63, |
101 }; | |
102 | |
1064 | 103 const uint8_t ff_alternate_vertical_scan[64] = { |
2967 | 104 0, 8, 16, 24, 1, 9, 2, 10, |
34 | 105 17, 25, 32, 40, 48, 56, 57, 49, |
2967 | 106 41, 33, 26, 18, 3, 11, 4, 12, |
34 | 107 19, 27, 34, 42, 50, 58, 35, 43, |
2967 | 108 51, 59, 20, 28, 5, 13, 6, 14, |
34 | 109 21, 29, 36, 44, 52, 60, 37, 45, |
2967 | 110 53, 61, 22, 30, 7, 15, 23, 31, |
34 | 111 38, 46, 54, 62, 39, 47, 55, 63, |
112 }; | |
113 | |
10207 | 114 /* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256 |
115 * for a>16909558, is an overestimate by less than 1 part in 1<<24 */ | |
116 const uint32_t ff_inverse[257]={ | |
2967 | 117 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757, |
118 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154, | |
119 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709, | |
120 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333, | |
121 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367, | |
122 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283, | |
123 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315, | |
124 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085, | |
125 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498, | |
126 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675, | |
127 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441, | |
128 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183, | |
129 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712, | |
130 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400, | |
131 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163, | |
132 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641, | |
133 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573, | |
134 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737, | |
135 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493, | |
136 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373, | |
137 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368, | |
138 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671, | |
139 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767, | |
140 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740, | |
141 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751, | |
142 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635, | |
143 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593, | |
144 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944, | |
145 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933, | |
146 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575, | |
147 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532, | |
220 | 148 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010, |
10207 | 149 16777216 |
220 | 150 }; |
151 | |
1092 | 152 /* Input permutation for the simple_idct_mmx */ |
153 static const uint8_t simple_mmx_permutation[64]={ | |
2979 | 154 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, |
155 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, | |
156 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, | |
157 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, | |
158 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, | |
159 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, | |
160 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, | |
161 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F, | |
1092 | 162 }; |
163 | |
6600
c3213c91124c
Add a new IDCT permutation, used in xvid_sse2 and possibly future similar IDCTs.
astrange
parents:
6450
diff
changeset
|
164 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7}; |
c3213c91124c
Add a new IDCT permutation, used in xvid_sse2 and possibly future similar IDCTs.
astrange
parents:
6450
diff
changeset
|
165 |
6438 | 166 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){ |
167 int i; | |
168 int end; | |
169 | |
170 st->scantable= src_scantable; | |
171 | |
172 for(i=0; i<64; i++){ | |
173 int j; | |
174 j = src_scantable[i]; | |
175 st->permutated[i] = permutation[j]; | |
8590 | 176 #if ARCH_PPC |
6438 | 177 st->inverse[j] = i; |
178 #endif | |
179 } | |
180 | |
181 end=-1; | |
182 for(i=0; i<64; i++){ | |
183 int j; | |
184 j = st->permutated[i]; | |
185 if(j>end) end=j; | |
186 st->raster_end[i]= end; | |
187 } | |
188 } | |
189 | |
/**
 * Sums all 256 samples of a 16x16 block.
 * @param pix        top-left sample of the block
 * @param line_size  byte stride between rows
 * @return sum of all samples
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;  /* advance to the next row */
    }
    return total;
}
211 | |
1064 | 212 static int pix_norm1_c(uint8_t * pix, int line_size) |
612 | 213 { |
214 int s, i, j; | |
4179 | 215 uint32_t *sq = ff_squareTbl + 256; |
612 | 216 |
217 s = 0; | |
218 for (i = 0; i < 16; i++) { | |
2979 | 219 for (j = 0; j < 16; j += 8) { |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
220 #if 0 |
2979 | 221 s += sq[pix[0]]; |
222 s += sq[pix[1]]; | |
223 s += sq[pix[2]]; | |
224 s += sq[pix[3]]; | |
225 s += sq[pix[4]]; | |
226 s += sq[pix[5]]; | |
227 s += sq[pix[6]]; | |
228 s += sq[pix[7]]; | |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
229 #else |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
230 #if LONG_MAX > 2147483647 |
2979 | 231 register uint64_t x=*(uint64_t*)pix; |
232 s += sq[x&0xff]; | |
233 s += sq[(x>>8)&0xff]; | |
234 s += sq[(x>>16)&0xff]; | |
235 s += sq[(x>>24)&0xff]; | |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
236 s += sq[(x>>32)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
237 s += sq[(x>>40)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
238 s += sq[(x>>48)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
239 s += sq[(x>>56)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
240 #else |
2979 | 241 register uint32_t x=*(uint32_t*)pix; |
242 s += sq[x&0xff]; | |
243 s += sq[(x>>8)&0xff]; | |
244 s += sq[(x>>16)&0xff]; | |
245 s += sq[(x>>24)&0xff]; | |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
246 x=*(uint32_t*)(pix+4); |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
247 s += sq[x&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
248 s += sq[(x>>8)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
249 s += sq[(x>>16)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
250 s += sq[(x>>24)&0xff]; |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
251 #endif |
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
252 #endif |
2979 | 253 pix += 8; |
254 } | |
255 pix += line_size - 16; | |
612 | 256 } |
257 return s; | |
258 } | |
259 | |
/**
 * Byte-swaps w 32-bit words from src into dst (dst may equal src).
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i = 0;

    /* main loop, unrolled by 8 */
    for (; i + 8 <= w; i += 8) {
        dst[i  ] = bswap_32(src[i  ]);
        dst[i+1] = bswap_32(src[i+1]);
        dst[i+2] = bswap_32(src[i+2]);
        dst[i+3] = bswap_32(src[i+3]);
        dst[i+4] = bswap_32(src[i+4]);
        dst[i+5] = bswap_32(src[i+5]);
        dst[i+6] = bswap_32(src[i+6]);
        dst[i+7] = bswap_32(src[i+7]);
    }
    /* remaining tail words */
    for (; i < w; i++)
        dst[i] = bswap_32(src[i]);
}
612 | 277 |
2184 | 278 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) |
279 { | |
280 int s, i; | |
4179 | 281 uint32_t *sq = ff_squareTbl + 256; |
2184 | 282 |
283 s = 0; | |
284 for (i = 0; i < h; i++) { | |
285 s += sq[pix1[0] - pix2[0]]; | |
286 s += sq[pix1[1] - pix2[1]]; | |
287 s += sq[pix1[2] - pix2[2]]; | |
288 s += sq[pix1[3] - pix2[3]]; | |
289 pix1 += line_size; | |
290 pix2 += line_size; | |
291 } | |
292 return s; | |
293 } | |
294 | |
1708 | 295 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) |
936 | 296 { |
297 int s, i; | |
4179 | 298 uint32_t *sq = ff_squareTbl + 256; |
936 | 299 |
300 s = 0; | |
1708 | 301 for (i = 0; i < h; i++) { |
936 | 302 s += sq[pix1[0] - pix2[0]]; |
303 s += sq[pix1[1] - pix2[1]]; | |
304 s += sq[pix1[2] - pix2[2]]; | |
305 s += sq[pix1[3] - pix2[3]]; | |
306 s += sq[pix1[4] - pix2[4]]; | |
307 s += sq[pix1[5] - pix2[5]]; | |
308 s += sq[pix1[6] - pix2[6]]; | |
309 s += sq[pix1[7] - pix2[7]]; | |
310 pix1 += line_size; | |
311 pix2 += line_size; | |
312 } | |
313 return s; | |
314 } | |
315 | |
1708 | 316 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) |
884 | 317 { |
1012
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
318 int s, i; |
4179 | 319 uint32_t *sq = ff_squareTbl + 256; |
884 | 320 |
321 s = 0; | |
1708 | 322 for (i = 0; i < h; i++) { |
1012
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
323 s += sq[pix1[ 0] - pix2[ 0]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
324 s += sq[pix1[ 1] - pix2[ 1]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
325 s += sq[pix1[ 2] - pix2[ 2]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
326 s += sq[pix1[ 3] - pix2[ 3]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
327 s += sq[pix1[ 4] - pix2[ 4]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
328 s += sq[pix1[ 5] - pix2[ 5]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
329 s += sq[pix1[ 6] - pix2[ 6]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
330 s += sq[pix1[ 7] - pix2[ 7]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
331 s += sq[pix1[ 8] - pix2[ 8]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
332 s += sq[pix1[ 9] - pix2[ 9]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
333 s += sq[pix1[10] - pix2[10]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
334 s += sq[pix1[11] - pix2[11]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
335 s += sq[pix1[12] - pix2[12]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
336 s += sq[pix1[13] - pix2[13]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
337 s += sq[pix1[14] - pix2[14]]; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
338 s += sq[pix1[15] - pix2[15]]; |
997
4dfe15ae0078
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
michaelni
parents:
996
diff
changeset
|
339 |
1012
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
340 pix1 += line_size; |
7a5038ec769b
sse16_c is totally fucked up (unaligned loads, LONG_MAX is undefined,
mellum
parents:
1011
diff
changeset
|
341 pix2 += line_size; |
884 | 342 } |
343 return s; | |
344 } | |
345 | |
2184 | 346 |
8590 | 347 #if CONFIG_SNOW_ENCODER //dwt is in snow.c |
/**
 * Wavelet-domain comparison score between two blocks (snow encoder).
 * Takes the pixel difference, runs a spatial DWT on it and sums the
 * per-subband-weighted absolute coefficients.
 * @param w     block width: 8, 16 or 32; must equal h
 * @param type  1 selects the 5/3 wavelet, 0 the 9/7 wavelet
 * @return weighted coefficient sum, scaled down by 2^9
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int score, x, y;
    const int dec_count = (w == 8) ? 3 : 4;
    int diff[32*32];
    int level, ori;
    /* subband weights, indexed [wavelet type][dec_count-3][level][orientation] */
    static const int scale[2][2][4][4]={
        {
            {
                // 9/7 8x8 dec=3
                {268, 239, 239, 213},
                {  0, 224, 224, 152},
                {  0, 135, 135, 110},
            },{
                // 9/7 16x16 or 32x32 dec=4
                {344, 310, 310, 280},
                {  0, 320, 320, 228},
                {  0, 175, 175, 136},
                {  0, 129, 129, 102},
            }
        },{
            {
                // 5/3 8x8 dec=3
                {275, 245, 245, 218},
                {  0, 230, 230, 156},
                {  0, 138, 138, 113},
            },{
                // 5/3 16x16 or 32x32 dec=4
                {352, 317, 317, 286},
                {  0, 328, 328, 233},
                {  0, 180, 180, 140},
                {  0, 132, 132, 105},
            }
        }
    };

    /* load the difference, scaled up by 16, into a 32-wide work buffer */
    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++)
            diff[32*y + x] = (pix1[x] - pix2[x]) << 4;
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(diff, w, h, 32, type, dec_count);

    score = 0;
    assert(w == h);
    for (level = 0; level < dec_count; level++) {
        /* level 0 also includes the LL band (ori 0) */
        for (ori = level ? 1 : 0; ori < 4; ori++) {
            int size   = w >> (dec_count - level);
            int sx     = (ori & 1) ? size : 0;
            int stride = 32 << (dec_count - level);
            int sy     = (ori & 2) ? stride >> 1 : 0;

            for (y = 0; y < size; y++) {
                for (x = 0; x < size; x++) {
                    int coef = diff[sx + sy + y*stride + x] * scale[type][dec_count-3][level][ori];
                    score += FFABS(coef);
                }
            }
        }
    }
    assert(score >= 0);
    return score >> 9;
}
416 | |
/* 8x8 wavelet cmp using the 5/3 wavelet */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}
420 | |
/* 8x8 wavelet cmp using the 9/7 wavelet */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}
424 | |
/* 16x16 wavelet cmp using the 5/3 wavelet */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
428 | |
/* 16x16 wavelet cmp using the 9/7 wavelet */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
432 | |
/* 32x32 wavelet cmp using the 5/3 wavelet (non-static: used by snow ME) */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
436 |
/* 32x32 wavelet cmp using the 9/7 wavelet (non-static: used by snow ME) */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
3373
b8996cc5ccae
Disable w53 and w97 cmp methods when snow encoder is disabled
gpoirier
parents:
3323
diff
changeset
|
440 #endif |
3323
87c54a3f8d19
Snow: fix subband weighting in wavelet cmp functions. use 32x32 cmp in iterative motion estimation.
lorenm
parents:
3248
diff
changeset
|
441 |
/* Draws the edges of width 'w' of an image of size width x height by
 * replicating the outermost samples into the surrounding border.
 * Fix: the two bottom-corner comments previously said "top left"/"top right". */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;

    /* replicate the first and last rows upward/downward */
    for (i = 0; i < w; i++) {
        memcpy(buf       - (i + 1) * wrap, buf,       width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }

    /* replicate the leftmost/rightmost sample of every row sideways */
    ptr = buf;
    for (i = 0; i < height; i++) {
        memset(ptr - w,     ptr[0],         w);
        memset(ptr + width, ptr[width - 1], w);
        ptr += wrap;
    }

    /* fill the four corner areas from the image's corner samples */
    for (i = 0; i < w; i++) {
        memset(buf       - (i + 1) * wrap - w,     buf[0],               w); /* top left */
        memset(buf       - (i + 1) * wrap + width, buf[width - 1],       w); /* top right */
        memset(last_line + (i + 1) * wrap - w,     last_line[0],         w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width - 1], w); /* bottom right */
    }
}
470 | |
/**
 * Copies a rectangular block of samples into a temporary buffer, replicating
 * the border samples for any part that lies outside the source image.
 * @param buf       destination buffer
 * @param src       pointer to the block's top-left sample position in the
 *                  source image (may point outside the valid area)
 * @param linesize  number of bytes between vertically adjacent samples in
 *                  both buffers
 * @param block_w   width of the block
 * @param block_h   height of the block
 * @param src_x     x coordinate of the block's top-left sample in the source
 * @param src_y     y coordinate of the block's top-left sample in the source
 * @param w         width of the source image
 * @param h         height of the source image
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    /* clamp a block lying entirely outside the image to the nearest edge */
    if (src_y >= h) {
        src   += (h - 1 - src_y) * linesize;
        src_y  = h - 1;
    } else if (src_y <= -block_h) {
        src   += (1 - block_h - src_y) * linesize;
        src_y  = 1 - block_h;
    }
    if (src_x >= w) {
        src   += w - 1 - src_x;
        src_x  = w - 1;
    } else if (src_x <= -block_w) {
        src   += 1 - block_w - src_x;
        src_x  = 1 - block_w;
    }

    /* sub-rectangle of the block that is covered by real samples */
    start_y = -src_y > 0 ? -src_y : 0;
    start_x = -src_x > 0 ? -src_x : 0;
    end_y   = h - src_y < block_h ? h - src_y : block_h;
    end_x   = w - src_x < block_w ? w - src_x : block_w;

    /* copy the part that exists */
    for (y = start_y; y < end_y; y++)
        for (x = start_x; x < end_x; x++)
            buf[x + y*linesize] = src[x + y*linesize];

    /* replicate the first valid row upward */
    for (y = 0; y < start_y; y++)
        for (x = start_x; x < end_x; x++)
            buf[x + y*linesize] = buf[x + start_y*linesize];

    /* replicate the last valid row downward */
    for (y = end_y; y < block_h; y++)
        for (x = start_x; x < end_x; x++)
            buf[x + y*linesize] = buf[x + (end_y - 1)*linesize];

    /* replicate the leftmost/rightmost valid columns sideways */
    for (y = 0; y < block_h; y++) {
        for (x = 0; x < start_x; x++)
            buf[x + y*linesize] = buf[start_x + y*linesize];
        for (x = end_x; x < block_w; x++)
            buf[x + y*linesize] = buf[end_x - 1 + y*linesize];
    }
}
541 | |
1064 | 542 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size) |
0 | 543 { |
544 int i; | |
545 | |
546 /* read the pixels */ | |
547 for(i=0;i<8;i++) { | |
516 | 548 block[0] = pixels[0]; |
549 block[1] = pixels[1]; | |
550 block[2] = pixels[2]; | |
551 block[3] = pixels[3]; | |
552 block[4] = pixels[4]; | |
553 block[5] = pixels[5]; | |
554 block[6] = pixels[6]; | |
555 block[7] = pixels[7]; | |
556 pixels += line_size; | |
557 block += 8; | |
0 | 558 } |
559 } | |
560 | |
1064 | 561 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1, |
2979 | 562 const uint8_t *s2, int stride){ |
324 | 563 int i; |
564 | |
565 /* read the pixels */ | |
566 for(i=0;i<8;i++) { | |
516 | 567 block[0] = s1[0] - s2[0]; |
568 block[1] = s1[1] - s2[1]; | |
569 block[2] = s1[2] - s2[2]; | |
570 block[3] = s1[3] - s2[3]; | |
571 block[4] = s1[4] - s2[4]; | |
572 block[5] = s1[5] - s2[5]; | |
573 block[6] = s1[6] - s2[6]; | |
574 block[7] = s1[7] - s2[7]; | |
324 | 575 s1 += stride; |
576 s2 += stride; | |
516 | 577 block += 8; |
324 | 578 } |
579 } | |
580 | |
581 | |
1064 | 582 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels, |
2979 | 583 int line_size) |
0 | 584 { |
585 int i; | |
4176 | 586 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2967 | 587 |
0 | 588 /* read the pixels */ |
589 for(i=0;i<8;i++) { | |
516 | 590 pixels[0] = cm[block[0]]; |
591 pixels[1] = cm[block[1]]; | |
592 pixels[2] = cm[block[2]]; | |
593 pixels[3] = cm[block[3]]; | |
594 pixels[4] = cm[block[4]]; | |
595 pixels[5] = cm[block[5]]; | |
596 pixels[6] = cm[block[6]]; | |
597 pixels[7] = cm[block[7]]; | |
598 | |
599 pixels += line_size; | |
600 block += 8; | |
0 | 601 } |
602 } | |
603 | |
2256 | 604 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels, |
2979 | 605 int line_size) |
2256 | 606 { |
607 int i; | |
4176 | 608 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2967 | 609 |
2256 | 610 /* read the pixels */ |
611 for(i=0;i<4;i++) { | |
612 pixels[0] = cm[block[0]]; | |
613 pixels[1] = cm[block[1]]; | |
614 pixels[2] = cm[block[2]]; | |
615 pixels[3] = cm[block[3]]; | |
616 | |
617 pixels += line_size; | |
618 block += 8; | |
619 } | |
620 } | |
621 | |
2257 | 622 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels, |
2979 | 623 int line_size) |
2257 | 624 { |
625 int i; | |
4176 | 626 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2967 | 627 |
2257 | 628 /* read the pixels */ |
629 for(i=0;i<2;i++) { | |
630 pixels[0] = cm[block[0]]; | |
631 pixels[1] = cm[block[1]]; | |
632 | |
633 pixels += line_size; | |
634 block += 8; | |
635 } | |
636 } | |
637 | |
2967 | 638 static void put_signed_pixels_clamped_c(const DCTELEM *block, |
1984
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
639 uint8_t *restrict pixels, |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
640 int line_size) |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
641 { |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
642 int i, j; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
643 |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
644 for (i = 0; i < 8; i++) { |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
645 for (j = 0; j < 8; j++) { |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
646 if (*block < -128) |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
647 *pixels = 0; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
648 else if (*block > 127) |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
649 *pixels = 255; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
650 else |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
651 *pixels = (uint8_t)(*block + 128); |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
652 block++; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
653 pixels++; |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
654 } |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
655 pixels += (line_size - 8); |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
656 } |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
657 } |
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
658 |
1064 | 659 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels, |
516 | 660 int line_size) |
0 | 661 { |
662 int i; | |
4176 | 663 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2967 | 664 |
0 | 665 /* read the pixels */ |
666 for(i=0;i<8;i++) { | |
516 | 667 pixels[0] = cm[pixels[0] + block[0]]; |
668 pixels[1] = cm[pixels[1] + block[1]]; | |
669 pixels[2] = cm[pixels[2] + block[2]]; | |
670 pixels[3] = cm[pixels[3] + block[3]]; | |
671 pixels[4] = cm[pixels[4] + block[4]]; | |
672 pixels[5] = cm[pixels[5] + block[5]]; | |
673 pixels[6] = cm[pixels[6] + block[6]]; | |
674 pixels[7] = cm[pixels[7] + block[7]]; | |
675 pixels += line_size; | |
676 block += 8; | |
0 | 677 } |
678 } | |
2256 | 679 |
680 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels, | |
681 int line_size) | |
682 { | |
683 int i; | |
4176 | 684 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2967 | 685 |
2256 | 686 /* read the pixels */ |
687 for(i=0;i<4;i++) { | |
688 pixels[0] = cm[pixels[0] + block[0]]; | |
689 pixels[1] = cm[pixels[1] + block[1]]; | |
690 pixels[2] = cm[pixels[2] + block[2]]; | |
691 pixels[3] = cm[pixels[3] + block[3]]; | |
692 pixels += line_size; | |
693 block += 8; | |
694 } | |
695 } | |
2257 | 696 |
697 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels, | |
698 int line_size) | |
699 { | |
700 int i; | |
4176 | 701 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2967 | 702 |
2257 | 703 /* read the pixels */ |
704 for(i=0;i<2;i++) { | |
705 pixels[0] = cm[pixels[0] + block[0]]; | |
706 pixels[1] = cm[pixels[1] + block[1]]; | |
707 pixels += line_size; | |
708 block += 8; | |
709 } | |
710 } | |
2763 | 711 |
712 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size) | |
713 { | |
714 int i; | |
715 for(i=0;i<8;i++) { | |
716 pixels[0] += block[0]; | |
717 pixels[1] += block[1]; | |
718 pixels[2] += block[2]; | |
719 pixels[3] += block[3]; | |
720 pixels[4] += block[4]; | |
721 pixels[5] += block[5]; | |
722 pixels[6] += block[6]; | |
723 pixels[7] += block[7]; | |
724 pixels += line_size; | |
725 block += 8; | |
726 } | |
727 } | |
728 | |
729 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size) | |
730 { | |
731 int i; | |
732 for(i=0;i<4;i++) { | |
733 pixels[0] += block[0]; | |
734 pixels[1] += block[1]; | |
735 pixels[2] += block[2]; | |
736 pixels[3] += block[3]; | |
737 pixels += line_size; | |
738 block += 4; | |
739 } | |
740 } | |
741 | |
4988
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
742 static int sum_abs_dctelem_c(DCTELEM *block) |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
743 { |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
744 int sum=0, i; |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
745 for(i=0; i<64; i++) |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
746 sum+= FFABS(block[i]); |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
747 return sum; |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
748 } |
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
749 |
385 | 750 #if 0 |
751 | |
752 #define PIXOP2(OPNAME, OP) \ | |
651 | 753 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 754 {\ |
755 int i;\ | |
756 for(i=0; i<h; i++){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
757 OP(*((uint64_t*)block), AV_RN64(pixels));\ |
385 | 758 pixels+=line_size;\ |
759 block +=line_size;\ | |
760 }\ | |
761 }\ | |
762 \ | |
859 | 763 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 764 {\ |
765 int i;\ | |
766 for(i=0; i<h; i++){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
767 const uint64_t a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
768 const uint64_t b= AV_RN64(pixels+1);\ |
385 | 769 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ |
770 pixels+=line_size;\ | |
771 block +=line_size;\ | |
772 }\ | |
773 }\ | |
774 \ | |
859 | 775 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 776 {\ |
777 int i;\ | |
778 for(i=0; i<h; i++){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
779 const uint64_t a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
780 const uint64_t b= AV_RN64(pixels+1);\ |
385 | 781 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ |
782 pixels+=line_size;\ | |
783 block +=line_size;\ | |
784 }\ | |
785 }\ | |
786 \ | |
859 | 787 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 788 {\ |
789 int i;\ | |
790 for(i=0; i<h; i++){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
791 const uint64_t a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
792 const uint64_t b= AV_RN64(pixels+line_size);\ |
385 | 793 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ |
794 pixels+=line_size;\ | |
795 block +=line_size;\ | |
796 }\ | |
797 }\ | |
798 \ | |
859 | 799 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 800 {\ |
801 int i;\ | |
802 for(i=0; i<h; i++){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
803 const uint64_t a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
804 const uint64_t b= AV_RN64(pixels+line_size);\ |
385 | 805 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ |
806 pixels+=line_size;\ | |
807 block +=line_size;\ | |
808 }\ | |
809 }\ | |
810 \ | |
859 | 811 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 812 {\ |
813 int i;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
814 const uint64_t a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
815 const uint64_t b= AV_RN64(pixels+1);\ |
385 | 816 uint64_t l0= (a&0x0303030303030303ULL)\ |
817 + (b&0x0303030303030303ULL)\ | |
818 + 0x0202020202020202ULL;\ | |
819 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
820 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
821 uint64_t l1,h1;\ | |
822 \ | |
823 pixels+=line_size;\ | |
824 for(i=0; i<h; i+=2){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
825 uint64_t a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
826 uint64_t b= AV_RN64(pixels+1);\ |
385 | 827 l1= (a&0x0303030303030303ULL)\ |
828 + (b&0x0303030303030303ULL);\ | |
829 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
830 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
831 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ | |
832 pixels+=line_size;\ | |
833 block +=line_size;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
834 a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
835 b= AV_RN64(pixels+1);\ |
385 | 836 l0= (a&0x0303030303030303ULL)\ |
837 + (b&0x0303030303030303ULL)\ | |
838 + 0x0202020202020202ULL;\ | |
839 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
840 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
841 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ | |
842 pixels+=line_size;\ | |
843 block +=line_size;\ | |
844 }\ | |
845 }\ | |
846 \ | |
859 | 847 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 848 {\ |
849 int i;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
850 const uint64_t a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
851 const uint64_t b= AV_RN64(pixels+1);\ |
385 | 852 uint64_t l0= (a&0x0303030303030303ULL)\ |
853 + (b&0x0303030303030303ULL)\ | |
854 + 0x0101010101010101ULL;\ | |
855 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
856 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
857 uint64_t l1,h1;\ | |
858 \ | |
859 pixels+=line_size;\ | |
860 for(i=0; i<h; i+=2){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
861 uint64_t a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
862 uint64_t b= AV_RN64(pixels+1);\ |
385 | 863 l1= (a&0x0303030303030303ULL)\ |
864 + (b&0x0303030303030303ULL);\ | |
865 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
866 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
867 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ | |
868 pixels+=line_size;\ | |
869 block +=line_size;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
870 a= AV_RN64(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
871 b= AV_RN64(pixels+1);\ |
385 | 872 l0= (a&0x0303030303030303ULL)\ |
873 + (b&0x0303030303030303ULL)\ | |
874 + 0x0101010101010101ULL;\ | |
875 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
876 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
877 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ | |
878 pixels+=line_size;\ | |
879 block +=line_size;\ | |
880 }\ | |
881 }\ | |
882 \ | |
859 | 883 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\ |
884 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\ | |
885 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\ | |
886 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\ | |
887 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\ | |
888 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\ | |
889 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8) | |
385 | 890 |
891 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) ) | |
892 #else // 64 bit variant | |
893 | |
894 #define PIXOP2(OPNAME, OP) \ | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
895 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
896 int i;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
897 for(i=0; i<h; i++){\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
898 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\ |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
899 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
900 block +=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
901 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
902 }\ |
1168 | 903 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
904 int i;\ | |
905 for(i=0; i<h; i++){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
906 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\ |
1168 | 907 pixels+=line_size;\ |
908 block +=line_size;\ | |
909 }\ | |
910 }\ | |
859 | 911 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
385 | 912 int i;\ |
913 for(i=0; i<h; i++){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
914 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
915 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\ |
385 | 916 pixels+=line_size;\ |
917 block +=line_size;\ | |
918 }\ | |
919 }\ | |
859 | 920 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
921 OPNAME ## _pixels8_c(block, pixels, line_size, h);\ | |
651 | 922 }\ |
385 | 923 \ |
651 | 924 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
925 int src_stride1, int src_stride2, int h){\ | |
385 | 926 int i;\ |
927 for(i=0; i<h; i++){\ | |
651 | 928 uint32_t a,b;\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
929 a= AV_RN32(&src1[i*src_stride1 ]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
930 b= AV_RN32(&src2[i*src_stride2 ]);\ |
1264 | 931 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
932 a= AV_RN32(&src1[i*src_stride1+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
933 b= AV_RN32(&src2[i*src_stride2+4]);\ |
1264 | 934 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\ |
385 | 935 }\ |
936 }\ | |
937 \ | |
651 | 938 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
939 int src_stride1, int src_stride2, int h){\ | |
385 | 940 int i;\ |
941 for(i=0; i<h; i++){\ | |
651 | 942 uint32_t a,b;\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
943 a= AV_RN32(&src1[i*src_stride1 ]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
944 b= AV_RN32(&src2[i*src_stride2 ]);\ |
1264 | 945 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
946 a= AV_RN32(&src1[i*src_stride1+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
947 b= AV_RN32(&src2[i*src_stride2+4]);\ |
1264 | 948 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\ |
385 | 949 }\ |
950 }\ | |
951 \ | |
1168 | 952 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
953 int src_stride1, int src_stride2, int h){\ | |
954 int i;\ | |
955 for(i=0; i<h; i++){\ | |
956 uint32_t a,b;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
957 a= AV_RN32(&src1[i*src_stride1 ]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
958 b= AV_RN32(&src2[i*src_stride2 ]);\ |
1264 | 959 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ |
1168 | 960 }\ |
961 }\ | |
962 \ | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
963 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
964 int src_stride1, int src_stride2, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
965 int i;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
966 for(i=0; i<h; i++){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
967 uint32_t a,b;\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
968 a= AV_RN16(&src1[i*src_stride1 ]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
969 b= AV_RN16(&src2[i*src_stride2 ]);\ |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
970 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
971 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
972 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
973 \ |
651 | 974 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ |
975 int src_stride1, int src_stride2, int h){\ | |
976 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\ | |
977 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\ | |
978 }\ | |
979 \ | |
980 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ | |
981 int src_stride1, int src_stride2, int h){\ | |
982 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\ | |
983 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\ | |
984 }\ | |
985 \ | |
859 | 986 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
651 | 987 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ |
988 }\ | |
989 \ | |
859 | 990 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
651 | 991 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ |
992 }\ | |
993 \ | |
859 | 994 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
651 | 995 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ |
996 }\ | |
997 \ | |
859 | 998 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
651 | 999 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ |
385 | 1000 }\ |
1001 \ | |
651 | 1002 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ |
1003 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | |
1004 int i;\ | |
1005 for(i=0; i<h; i++){\ | |
1006 uint32_t a, b, c, d, l0, l1, h0, h1;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1007 a= AV_RN32(&src1[i*src_stride1]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1008 b= AV_RN32(&src2[i*src_stride2]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1009 c= AV_RN32(&src3[i*src_stride3]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1010 d= AV_RN32(&src4[i*src_stride4]);\ |
651 | 1011 l0= (a&0x03030303UL)\ |
1012 + (b&0x03030303UL)\ | |
1013 + 0x02020202UL;\ | |
1014 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1015 + ((b&0xFCFCFCFCUL)>>2);\ | |
1016 l1= (c&0x03030303UL)\ | |
1017 + (d&0x03030303UL);\ | |
1018 h1= ((c&0xFCFCFCFCUL)>>2)\ | |
1019 + ((d&0xFCFCFCFCUL)>>2);\ | |
1020 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1021 a= AV_RN32(&src1[i*src_stride1+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1022 b= AV_RN32(&src2[i*src_stride2+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1023 c= AV_RN32(&src3[i*src_stride3+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1024 d= AV_RN32(&src4[i*src_stride4+4]);\ |
651 | 1025 l0= (a&0x03030303UL)\ |
1026 + (b&0x03030303UL)\ | |
1027 + 0x02020202UL;\ | |
1028 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1029 + ((b&0xFCFCFCFCUL)>>2);\ | |
1030 l1= (c&0x03030303UL)\ | |
1031 + (d&0x03030303UL);\ | |
1032 h1= ((c&0xFCFCFCFCUL)>>2)\ | |
1033 + ((d&0xFCFCFCFCUL)>>2);\ | |
1034 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
1035 }\ | |
1036 }\ | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1037 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1038 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1039 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1040 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1041 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1042 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1043 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1044 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1045 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1046 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1047 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1048 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1049 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1050 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1051 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1052 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1053 \ |
651 | 1054 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ |
1055 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | |
385 | 1056 int i;\ |
1057 for(i=0; i<h; i++){\ | |
651 | 1058 uint32_t a, b, c, d, l0, l1, h0, h1;\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1059 a= AV_RN32(&src1[i*src_stride1]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1060 b= AV_RN32(&src2[i*src_stride2]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1061 c= AV_RN32(&src3[i*src_stride3]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1062 d= AV_RN32(&src4[i*src_stride4]);\ |
651 | 1063 l0= (a&0x03030303UL)\ |
1064 + (b&0x03030303UL)\ | |
1065 + 0x01010101UL;\ | |
1066 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1067 + ((b&0xFCFCFCFCUL)>>2);\ | |
1068 l1= (c&0x03030303UL)\ | |
1069 + (d&0x03030303UL);\ | |
1070 h1= ((c&0xFCFCFCFCUL)>>2)\ | |
1071 + ((d&0xFCFCFCFCUL)>>2);\ | |
1072 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1073 a= AV_RN32(&src1[i*src_stride1+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1074 b= AV_RN32(&src2[i*src_stride2+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1075 c= AV_RN32(&src3[i*src_stride3+4]);\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1076 d= AV_RN32(&src4[i*src_stride4+4]);\ |
651 | 1077 l0= (a&0x03030303UL)\ |
1078 + (b&0x03030303UL)\ | |
1079 + 0x01010101UL;\ | |
1080 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1081 + ((b&0xFCFCFCFCUL)>>2);\ | |
1082 l1= (c&0x03030303UL)\ | |
1083 + (d&0x03030303UL);\ | |
1084 h1= ((c&0xFCFCFCFCUL)>>2)\ | |
1085 + ((d&0xFCFCFCFCUL)>>2);\ | |
1086 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
385 | 1087 }\ |
1088 }\ | |
651 | 1089 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ |
1090 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | |
1091 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ | |
1092 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ | |
1093 }\ | |
1094 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\ | |
1095 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | |
1096 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ | |
1097 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\ | |
1098 }\ | |
385 | 1099 \ |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1100 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1101 {\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1102 int i, a0, b0, a1, b1;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1103 a0= pixels[0];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1104 b0= pixels[1] + 2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1105 a0 += b0;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1106 b0 += pixels[2];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1107 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1108 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1109 for(i=0; i<h; i+=2){\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1110 a1= pixels[0];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1111 b1= pixels[1];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1112 a1 += b1;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1113 b1 += pixels[2];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1114 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1115 block[0]= (a1+a0)>>2; /* FIXME non put */\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1116 block[1]= (b1+b0)>>2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1117 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1118 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1119 block +=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1120 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1121 a0= pixels[0];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1122 b0= pixels[1] + 2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1123 a0 += b0;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1124 b0 += pixels[2];\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1125 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1126 block[0]= (a1+a0)>>2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1127 block[1]= (b1+b0)>>2;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1128 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1129 block +=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1130 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1131 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1132 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1133 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1134 {\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1135 int i;\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1136 const uint32_t a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1137 const uint32_t b= AV_RN32(pixels+1);\ |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1138 uint32_t l0= (a&0x03030303UL)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1139 + (b&0x03030303UL)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1140 + 0x02020202UL;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1141 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1142 + ((b&0xFCFCFCFCUL)>>2);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1143 uint32_t l1,h1;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1144 \ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1145 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1146 for(i=0; i<h; i+=2){\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1147 uint32_t a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1148 uint32_t b= AV_RN32(pixels+1);\ |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1149 l1= (a&0x03030303UL)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1150 + (b&0x03030303UL);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1151 h1= ((a&0xFCFCFCFCUL)>>2)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1152 + ((b&0xFCFCFCFCUL)>>2);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1153 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1154 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1155 block +=line_size;\ |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1156 a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1157 b= AV_RN32(pixels+1);\ |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1158 l0= (a&0x03030303UL)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1159 + (b&0x03030303UL)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1160 + 0x02020202UL;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1161 h0= ((a&0xFCFCFCFCUL)>>2)\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1162 + ((b&0xFCFCFCFCUL)>>2);\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1163 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1164 pixels+=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1165 block +=line_size;\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1166 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1167 }\ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1168 \ |
859 | 1169 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 1170 {\ |
1171 int j;\ | |
1172 for(j=0; j<2; j++){\ | |
1173 int i;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1174 const uint32_t a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1175 const uint32_t b= AV_RN32(pixels+1);\ |
385 | 1176 uint32_t l0= (a&0x03030303UL)\ |
1177 + (b&0x03030303UL)\ | |
1178 + 0x02020202UL;\ | |
1179 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1180 + ((b&0xFCFCFCFCUL)>>2);\ | |
1181 uint32_t l1,h1;\ | |
1182 \ | |
1183 pixels+=line_size;\ | |
1184 for(i=0; i<h; i+=2){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1185 uint32_t a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1186 uint32_t b= AV_RN32(pixels+1);\ |
385 | 1187 l1= (a&0x03030303UL)\ |
1188 + (b&0x03030303UL);\ | |
1189 h1= ((a&0xFCFCFCFCUL)>>2)\ | |
1190 + ((b&0xFCFCFCFCUL)>>2);\ | |
1191 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
1192 pixels+=line_size;\ | |
1193 block +=line_size;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1194 a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1195 b= AV_RN32(pixels+1);\ |
385 | 1196 l0= (a&0x03030303UL)\ |
1197 + (b&0x03030303UL)\ | |
1198 + 0x02020202UL;\ | |
1199 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1200 + ((b&0xFCFCFCFCUL)>>2);\ | |
1201 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
1202 pixels+=line_size;\ | |
1203 block +=line_size;\ | |
1204 }\ | |
1205 pixels+=4-line_size*(h+1);\ | |
1206 block +=4-line_size*h;\ | |
1207 }\ | |
1208 }\ | |
1209 \ | |
859 | 1210 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ |
385 | 1211 {\ |
1212 int j;\ | |
1213 for(j=0; j<2; j++){\ | |
1214 int i;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1215 const uint32_t a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1216 const uint32_t b= AV_RN32(pixels+1);\ |
385 | 1217 uint32_t l0= (a&0x03030303UL)\ |
1218 + (b&0x03030303UL)\ | |
1219 + 0x01010101UL;\ | |
1220 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1221 + ((b&0xFCFCFCFCUL)>>2);\ | |
1222 uint32_t l1,h1;\ | |
1223 \ | |
1224 pixels+=line_size;\ | |
1225 for(i=0; i<h; i+=2){\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1226 uint32_t a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1227 uint32_t b= AV_RN32(pixels+1);\ |
385 | 1228 l1= (a&0x03030303UL)\ |
1229 + (b&0x03030303UL);\ | |
1230 h1= ((a&0xFCFCFCFCUL)>>2)\ | |
1231 + ((b&0xFCFCFCFCUL)>>2);\ | |
1232 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
1233 pixels+=line_size;\ | |
1234 block +=line_size;\ | |
5520
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1235 a= AV_RN32(pixels );\ |
c16a59ef6a86
* renaming (ST|LD)(16|32|64) -> AV_(R|W)N(16|32|64)
romansh
parents:
5411
diff
changeset
|
1236 b= AV_RN32(pixels+1);\ |
385 | 1237 l0= (a&0x03030303UL)\ |
1238 + (b&0x03030303UL)\ | |
1239 + 0x01010101UL;\ | |
1240 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
1241 + ((b&0xFCFCFCFCUL)>>2);\ | |
1242 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
1243 pixels+=line_size;\ | |
1244 block +=line_size;\ | |
1245 }\ | |
1246 pixels+=4-line_size*(h+1);\ | |
1247 block +=4-line_size*h;\ | |
1248 }\ | |
1249 }\ | |
1250 \ | |
859 | 1251 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\ |
1252 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\ | |
1253 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\ | |
1254 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\ | |
1255 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\ | |
1256 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\ | |
1257 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\ | |
1258 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\ | |
651 | 1259 |
1264 | 1260 #define op_avg(a, b) a = rnd_avg32(a, b) |
385 | 1261 #endif |
1262 #define op_put(a, b) a = b | |
1263 | |
1264 PIXOP2(avg, op_avg) | |
1265 PIXOP2(put, op_put) | |
1266 #undef op_avg | |
1267 #undef op_put | |
1268 | |
0 | 1269 #define avg2(a,b) ((a+b+1)>>1) |
1270 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2) | |
1271 | |
/* Thin wrapper: forward a 16-wide two-source blend to put_no_rnd_pixels16_l2,
 * using the one caller-supplied stride for dst and both sources. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1275 | |
/* Thin wrapper: forward an 8-wide two-source blend to put_no_rnd_pixels8_l2,
 * using the one caller-supplied stride for dst and both sources. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
753 | 1279 |
/**
 * One-warp-point GMC: bilinear interpolation of an 8-pixel-wide block at a
 * fixed 1/16-pel fractional position (x16, y16).
 *
 * The four 2x2 neighbourhood weights sum to 256, so the weighted sum plus
 * "rounder" is scaled back with >>8.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* bilinear weights for top-left, top-right, bottom-left, bottom-right */
    const int w00 = (16 - x16) * (16 - y16);
    const int w01 =       x16  * (16 - y16);
    const int w10 = (16 - x16) *       y16;
    const int w11 =       x16  *       y16;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (w00 * src[col]
                      + w01 * src[col + 1]
                      + w10 * src[stride + col]
                      + w11 * src[stride + col + 1]
                      + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1302 | |
3248
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3245
diff
changeset
|
/* Clamp a into [amin, amax]; local equivalent of av_clip(), inlined here so
 * the border-replication logic below is self-contained. */
static inline int clip_to_range(int a, int amin, int amax)
{
    return a < amin ? amin : a > amax ? amax : a;
}

/**
 * Global motion compensation with a linear coordinate transform.
 *
 * An 8-column x h-row block is produced.  The source position is tracked in
 * fixed point: (vx, vy) advances by (dxx, dyx) per output column, and the row
 * start (ox, oy) by (dxy, dyy) per output row.  The integer sample position
 * is (v >> 16) >> shift, and the low "shift" bits of v >> 16 are the
 * interpolation fraction (s = 1 << shift steps per pixel).  Positions outside
 * the width x height source are clamped to the border (edge replication).
 * r is the rounding constant added before the final >> (shift*2).
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s = 1 << shift;   /* sub-pixel steps per full pixel */
    int y;

    /* switch from sizes to last valid indices */
    width--;
    height--;

    for (y = 0; y < h; y++) {
        int vx = ox;
        int vy = oy;
        int x;

        for (x = 0; x < 8; x++) { //XXX FIXME optimize
            int src_x  = vx >> 16;
            int src_y  = vy >> 16;
            int frac_x = src_x & (s - 1);
            int frac_y = src_y & (s - 1);
            int index;

            src_x >>= shift;
            src_y >>= shift;

            if ((unsigned)src_x < width) {
                if ((unsigned)src_y < height) {
                    /* fully inside: 2x2 bilinear interpolation */
                    index = src_x + src_y * stride;
                    dst[y * stride + x] =
                        ((src[index]          * (s - frac_x) + src[index + 1]          * frac_x) * (s - frac_y)
                       + (src[index + stride] * (s - frac_x) + src[index + stride + 1] * frac_x) * frac_y
                       + r) >> (shift * 2);
                } else {
                    /* y out of range: clamp the row, interpolate in x only */
                    index = src_x + clip_to_range(src_y, 0, height) * stride;
                    dst[y * stride + x] =
                        ((src[index] * (s - frac_x) + src[index + 1] * frac_x) * s
                       + r) >> (shift * 2);
                }
            } else {
                if ((unsigned)src_y < height) {
                    /* x out of range: clamp the column, interpolate in y only */
                    index = clip_to_range(src_x, 0, width) + src_y * stride;
                    dst[y * stride + x] =
                        ((src[index] * (s - frac_y) + src[index + stride] * frac_y) * s
                       + r) >> (shift * 2);
                } else {
                    /* both out of range: nearest border sample, no interpolation */
                    index = clip_to_range(src_x, 0, width)
                          + clip_to_range(src_y, 0, height) * stride;
                    dst[y * stride + x] = src[index];
                }
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1360 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/**
 * Thirdpel MC, zero fractional offset: plain block copy.
 *
 * Dispatches to the fixed-width copy routine matching "width"
 * (2/4/8/16); any other width is a no-op, as in the original switch.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) put_pixels2_c (dst, src, stride, height);
    else if (width ==  4) put_pixels4_c (dst, src, stride, height);
    else if (width ==  8) put_pixels8_c (dst, src, stride, height);
    else if (width == 16) put_pixels16_c(dst, src, stride, height);
}
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1369 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/**
 * Thirdpel horizontal interpolation at offset 1/3 of a pixel.
 *
 * Each output sample is round((2*a + b) / 3) of a pixel and its right
 * neighbour, computed with the fixed-point factor 683/2048 ~= 1/3.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (2 * src[col] + src[col + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1380 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/**
 * Thirdpel horizontal interpolation at offset 2/3 of a pixel.
 *
 * Each output sample is round((a + 2*b) / 3) of a pixel and its right
 * neighbour, computed with the fixed-point factor 683/2048 ~= 1/3.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (src[col] + 2 * src[col + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
2967 | 1391 |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/**
 * Copy a block with vertical 1/3-pel interpolation (SVQ3 third-pel MC).
 * Each output pixel is round((2*src[x] + src[x+stride]) / 3), i.e. 1/3 of
 * the way toward the pixel below; 683 ~= 2048/3 implements the division.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
2967 | 1402 |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/**
 * Copy a block with (1/3, 1/3) third-pel bilinear interpolation (SVQ3).
 * Tap weights are 4,3,3,2 (sum 12) for the current pixel, right, below and
 * diagonal neighbours; 2731 ~= 32768/12 implements the division by 12.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1413 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/**
 * Copy a block with (1/3, 2/3) third-pel bilinear interpolation (SVQ3).
 * Tap weights 3,2,4,3 (sum 12) bias the result toward the row below;
 * 2731 ~= 32768/12 implements the division by 12.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1424 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/**
 * Copy a block with vertical 2/3-pel interpolation (SVQ3 third-pel MC).
 * Each output pixel is round((src[x] + 2*src[x+stride]) / 3), i.e. 2/3 of
 * the way toward the pixel below; 683 ~= 2048/3 implements the division.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1435 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/**
 * Copy a block with (2/3, 1/3) third-pel bilinear interpolation (SVQ3).
 * Tap weights 3,4,2,3 (sum 12) bias the result toward the right column;
 * 2731 ~= 32768/12 implements the division by 12.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1446 |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
/**
 * Copy a block with (2/3, 2/3) third-pel bilinear interpolation (SVQ3).
 * Tap weights 2,3,3,4 (sum 12) bias the result toward the diagonal
 * neighbour; 2731 ~= 32768/12 implements the division by 12.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1319 | 1457 |
/**
 * Full-pel "average" MC: rounding-average src into dst with no filtering,
 * dispatching to the width-specific avg helpers. Widths other than
 * 2/4/8/16 are (as before) silently ignored.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) avg_pixels2_c (dst, src, stride, height);
    else if (width ==  4) avg_pixels4_c (dst, src, stride, height);
    else if (width ==  8) avg_pixels8_c (dst, src, stride, height);
    else if (width == 16) avg_pixels16_c(dst, src, stride, height);
}
1466 | |
/**
 * Horizontal 1/3-pel interpolation (weights 2,1 on current/right pixel,
 * 683 ~= 2048/3), rounding-averaged with the pixels already in dst.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(2*src[col] + src[col+1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1477 | |
/**
 * Horizontal 2/3-pel interpolation (weights 1,2 on current/right pixel,
 * 683 ~= 2048/3), rounding-averaged with the pixels already in dst.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(src[col] + 2*src[col+1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
2967 | 1488 |
/**
 * Vertical 1/3-pel interpolation (weights 2,1 on current/below pixel,
 * 683 ~= 2048/3), rounding-averaged with the pixels already in dst.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(2*src[col] + src[col+stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
2967 | 1499 |
/**
 * (1/3, 1/3) third-pel bilinear interpolation (tap weights 4,3,3,2,
 * 2731 ~= 32768/12), rounding-averaged with the pixels already in dst.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1510 | |
/**
 * (1/3, 2/3) third-pel bilinear interpolation (tap weights 3,2,4,3,
 * 2731 ~= 32768/12), rounding-averaged with the pixels already in dst.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1521 | |
/**
 * Vertical 2/3-pel interpolation (weights 1,2 on current/below pixel,
 * 683 ~= 2048/3), rounding-averaged with the pixels already in dst.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(src[col] + 2*src[col+stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1532 | |
/**
 * (2/3, 1/3) third-pel bilinear interpolation (tap weights 3,4,2,3,
 * 2731 ~= 32768/12), rounding-averaged with the pixels already in dst.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1543 | |
/**
 * (2/3, 2/3) third-pel bilinear interpolation (tap weights 2,3,3,4,
 * 2731 ~= 32768/12), rounding-averaged with the pixels already in dst.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
#if 0
/* Disabled generator for fixed-width tpel wrappers around the generic
 * width-parameterized functions above. NOTE(review): as written each body
 * has a stray "void" before the call, so these would not actually invoke
 * the generic functions — presumably why the whole block is compiled out. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
1575 |
/**
 * Generate the put_/avg_ H.264 chroma motion-compensation functions for
 * block widths 2, 4 and 8. A..D are the bilinear tap weights derived from
 * the 1/8-pel fractional position (x,y); A+B+C+D == 64, and OP applies the
 * +32 rounding and >>6 normalization (plus averaging for avg_).
 * When D == 0 the 2-D filter degenerates to a 1-D two-tap filter with
 * weights A and E = B+C, stepping vertically (stride) if C != 0 and
 * horizontally (1) otherwise — this also avoids reading the src[stride+*]
 * row/column that the full 2-D path would touch.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        /* degenerate 1-D case, see header comment */\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

/* avg: round-average the normalized result with dst; put: just normalize. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_ , op_put)
H264_CHROMA_MC(avg_ , op_avg)
#undef op_avg
#undef op_put
1684 | |
9439
ef3a7b711cc0
Rename put_no_rnd_h264_chroma* to reflect its usage in VC1 only
conrad
parents:
9437
diff
changeset
|
/**
 * VC-1 8-wide bilinear chroma MC with the "no rounding" bias: same A..D
 * 1/8-pel tap weights as the H.264 version, but the rounding constant is
 * 32-4 = 28 instead of 32 before the >>6 normalization.
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A = (8-x)*(8-y);
    const int B = (  x)*(8-y);
    const int C = (8-x)*(  y);
    const int D = (  x)*(  y);
    int i, j;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1] + 32 - 4) >> 6;
        dst += stride;
        src += stride;
    }
}
1708 | |
/**
 * 8-pixel-wide bilinear chroma MC, VC-1 "no rnd" variant, averaging mode.
 *
 * Identical interpolation to put_no_rnd_vc1_chroma_mc8_c (bilinear blend
 * with +28 bias before >>6), but the filtered pixel is combined with the
 * existing destination pixel through avg2() (defined elsewhere in this
 * file) instead of overwriting it.
 *
 * @param dst    destination block (align 8), read-modify-written
 * @param src    source block (align 1), reads h+1 rows of 9 pixels
 * @param stride line size of both dst and src
 * @param h      number of rows to process
 * @param x, y   fractional position, each in [0, 7]
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int wTL = (8 - x) * (8 - y); /* weight of src[col]          */
    const int wTR = (    x) * (8 - y); /* weight of src[col+1]        */
    const int wBL = (8 - x) * (    y); /* weight of src[stride+col]   */
    const int wBR = (    x) * (    y); /* weight of src[stride+col+1] */
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            const int pix = (wTL * src[col]
                           + wTR * src[col + 1]
                           + wBL * src[stride + col]
                           + wBR * src[stride + col + 1]
                           + 32 - 4) >> 6;
            dst[col] = avg2(dst[col], pix);
        }
        dst += stride;
        src += stride;
    }
}
1732 | |
/*
 * QPEL_MC: template that generates the complete set of MPEG-4 quarter-pel
 * motion-compensation C functions for one store operator.
 *
 *   r      - variant tag; not referenced inside this C template
 *   OPNAME - prefix of the generated functions (put_, avg_, ...)
 *   RND    - rounding infix spliced into the names of the put_* helper
 *            calls (selects the rounded / no-round helper set)
 *   OP     - store operator: OP(dst, sum) normalises the raw filter sum
 *            and writes (or averages) it into dst
 *
 * The *_h_lowpass / *_v_lowpass helpers apply the half-pel FIR filter with
 * tap weights (20, -6, 3, -1); at the block edges the outer taps are
 * reflected back inside the block, as the clamped/mirrored src indices in
 * the last rows show.  The qpelN_mcXY functions combine these helpers with
 * pixel averaging (pixelsN_l2 / pixelsN_l4, defined elsewhere in this
 * file) to produce all 16 quarter-pel positions X,Y in {0..3}.
 * NOTE(review): the ff_*_old_c variants are non-static and use a 4-way
 * average; presumably kept for external/legacy use - confirm callers.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
/* horizontal half-pel lowpass: 8 output pixels per row, h rows */\
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* vertical half-pel lowpass: 8 columns, 8 output rows */\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
/* horizontal half-pel lowpass, 16 pixels wide */\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* vertical half-pel lowpass, 16 pixels wide */\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
/* qpelN_mcXY_c: quarter-pel position (X/4, Y/4); mc00 is the full-pel copy */\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
/* 16x16 versions of the same quarter-pel positions */\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
255 | 2215 |
/* Store operators for QPEL_MC.  The raw filter sum is scaled by 32, so
 * the operators add +16 (rounded) or +15 ("no_rnd") before the >>5 and
 * clip through cm[] (the crop table in scope inside the template).
 * op_avg additionally averages with the existing destination pixel. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
255 | 2229 |
1168 | 2230 #if 1 |
2231 #define H264_LOWPASS(OPNAME, OP, OP2) \ | |
5151 | 2232 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2233 const int h=2;\ |
4176 | 2234 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2235 int i;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2236 for(i=0; i<h; i++)\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2237 {\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2238 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2239 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2240 dst+=dstStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2241 src+=srcStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2242 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2243 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2244 \ |
5151 | 2245 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2246 const int w=2;\ |
4176 | 2247 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2248 int i;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2249 for(i=0; i<w; i++)\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2250 {\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2251 const int srcB= src[-2*srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2252 const int srcA= src[-1*srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2253 const int src0= src[0 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2254 const int src1= src[1 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2255 const int src2= src[2 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2256 const int src3= src[3 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2257 const int src4= src[4 *srcStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2258 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2259 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2260 dst++;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2261 src++;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2262 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2263 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2264 \ |
5151 | 2265 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2266 const int h=2;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2267 const int w=2;\ |
4176 | 2268 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2269 int i;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2270 src -= 2*srcStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2271 for(i=0; i<h+5; i++)\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2272 {\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2273 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2274 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2275 tmp+=tmpStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2276 src+=srcStride;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2277 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2278 tmp -= tmpStride*(h+5-2);\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2279 for(i=0; i<w; i++)\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2280 {\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2281 const int tmpB= tmp[-2*tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2282 const int tmpA= tmp[-1*tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2283 const int tmp0= tmp[0 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2284 const int tmp1= tmp[1 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2285 const int tmp2= tmp[2 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2286 const int tmp3= tmp[3 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2287 const int tmp4= tmp[4 *tmpStride];\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2288 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2289 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2290 dst++;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2291 tmp++;\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2292 }\ |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2293 }\ |
1168 | 2294 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
2295 const int h=4;\ | |
4176 | 2296 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
1168 | 2297 int i;\ |
2298 for(i=0; i<h; i++)\ | |
2299 {\ | |
2300 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\ | |
2301 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\ | |
2302 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\ | |
2303 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\ | |
2304 dst+=dstStride;\ | |
2305 src+=srcStride;\ | |
2306 }\ | |
2307 }\ | |
2308 \ | |
2309 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
2310 const int w=4;\ | |
4176 | 2311 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
1168 | 2312 int i;\ |
2313 for(i=0; i<w; i++)\ | |
2314 {\ | |
2315 const int srcB= src[-2*srcStride];\ | |
2316 const int srcA= src[-1*srcStride];\ | |
2317 const int src0= src[0 *srcStride];\ | |
2318 const int src1= src[1 *srcStride];\ | |
2319 const int src2= src[2 *srcStride];\ | |
2320 const int src3= src[3 *srcStride];\ | |
2321 const int src4= src[4 *srcStride];\ | |
2322 const int src5= src[5 *srcStride];\ | |
2323 const int src6= src[6 *srcStride];\ | |
2324 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\ | |
2325 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\ | |
2326 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\ | |
2327 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\ | |
2328 dst++;\ | |
2329 src++;\ | |
2330 }\ | |
2331 }\ | |
2332 \ | |
2333 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | |
2334 const int h=4;\ | |
2335 const int w=4;\ | |
4176 | 2336 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
1168 | 2337 int i;\ |
2338 src -= 2*srcStride;\ | |
2339 for(i=0; i<h+5; i++)\ | |
2340 {\ | |
2341 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\ | |
2342 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\ | |
2343 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\ | |
2344 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\ | |
2345 tmp+=tmpStride;\ | |
2346 src+=srcStride;\ | |
2347 }\ | |
2348 tmp -= tmpStride*(h+5-2);\ | |
2349 for(i=0; i<w; i++)\ | |
2350 {\ | |
2351 const int tmpB= tmp[-2*tmpStride];\ | |
2352 const int tmpA= tmp[-1*tmpStride];\ | |
2353 const int tmp0= tmp[0 *tmpStride];\ | |
2354 const int tmp1= tmp[1 *tmpStride];\ | |
2355 const int tmp2= tmp[2 *tmpStride];\ | |
2356 const int tmp3= tmp[3 *tmpStride];\ | |
2357 const int tmp4= tmp[4 *tmpStride];\ | |
2358 const int tmp5= tmp[5 *tmpStride];\ | |
2359 const int tmp6= tmp[6 *tmpStride];\ | |
2360 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\ | |
2361 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\ | |
2362 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\ | |
2363 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\ | |
2364 dst++;\ | |
2365 tmp++;\ | |
2366 }\ | |
2367 }\ | |
2368 \ | |
2369 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
2370 const int h=8;\ | |
4176 | 2371 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
1168 | 2372 int i;\ |
2373 for(i=0; i<h; i++)\ | |
2374 {\ | |
2375 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\ | |
2376 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\ | |
2377 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\ | |
2378 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\ | |
2379 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\ | |
2380 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\ | |
2381 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\ | |
2382 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\ | |
2383 dst+=dstStride;\ | |
2384 src+=srcStride;\ | |
2385 }\ | |
2386 }\ | |
2387 \ | |
2388 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
2389 const int w=8;\ | |
4176 | 2390 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
1168 | 2391 int i;\ |
2392 for(i=0; i<w; i++)\ | |
2393 {\ | |
2394 const int srcB= src[-2*srcStride];\ | |
2395 const int srcA= src[-1*srcStride];\ | |
2396 const int src0= src[0 *srcStride];\ | |
2397 const int src1= src[1 *srcStride];\ | |
2398 const int src2= src[2 *srcStride];\ | |
2399 const int src3= src[3 *srcStride];\ | |
2400 const int src4= src[4 *srcStride];\ | |
2401 const int src5= src[5 *srcStride];\ | |
2402 const int src6= src[6 *srcStride];\ | |
2403 const int src7= src[7 *srcStride];\ | |
2404 const int src8= src[8 *srcStride];\ | |
2405 const int src9= src[9 *srcStride];\ | |
2406 const int src10=src[10*srcStride];\ | |
2407 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\ | |
2408 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\ | |
2409 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\ | |
2410 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\ | |
2411 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\ | |
2412 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\ | |
2413 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\ | |
2414 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\ | |
2415 dst++;\ | |
2416 src++;\ | |
2417 }\ | |
2418 }\ | |
2419 \ | |
2420 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | |
2421 const int h=8;\ | |
2422 const int w=8;\ | |
4176 | 2423 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\ |
1168 | 2424 int i;\ |
2425 src -= 2*srcStride;\ | |
2426 for(i=0; i<h+5; i++)\ | |
2427 {\ | |
2428 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\ | |
2429 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\ | |
2430 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\ | |
2431 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\ | |
2432 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\ | |
2433 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\ | |
2434 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\ | |
2435 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\ | |
2436 tmp+=tmpStride;\ | |
2437 src+=srcStride;\ | |
2438 }\ | |
2439 tmp -= tmpStride*(h+5-2);\ | |
2440 for(i=0; i<w; i++)\ | |
2441 {\ | |
2442 const int tmpB= tmp[-2*tmpStride];\ | |
2443 const int tmpA= tmp[-1*tmpStride];\ | |
2444 const int tmp0= tmp[0 *tmpStride];\ | |
2445 const int tmp1= tmp[1 *tmpStride];\ | |
2446 const int tmp2= tmp[2 *tmpStride];\ | |
2447 const int tmp3= tmp[3 *tmpStride];\ | |
2448 const int tmp4= tmp[4 *tmpStride];\ | |
2449 const int tmp5= tmp[5 *tmpStride];\ | |
2450 const int tmp6= tmp[6 *tmpStride];\ | |
2451 const int tmp7= tmp[7 *tmpStride];\ | |
2452 const int tmp8= tmp[8 *tmpStride];\ | |
2453 const int tmp9= tmp[9 *tmpStride];\ | |
2454 const int tmp10=tmp[10*tmpStride];\ | |
2455 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\ | |
2456 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\ | |
2457 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\ | |
2458 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\ | |
2459 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\ | |
2460 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\ | |
2461 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\ | |
2462 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\ | |
2463 dst++;\ | |
2464 tmp++;\ | |
2465 }\ | |
2466 }\ | |
2467 \ | |
2468 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
2469 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\ | |
2470 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\ | |
2471 src += 8*srcStride;\ | |
2472 dst += 8*dstStride;\ | |
2473 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\ | |
2474 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\ | |
2475 }\ | |
2476 \ | |
2477 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ | |
2478 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\ | |
2479 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\ | |
2480 src += 8*srcStride;\ | |
2481 dst += 8*dstStride;\ | |
2482 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\ | |
2483 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\ | |
2484 }\ | |
2485 \ | |
2486 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ | |
2487 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\ | |
2488 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\ | |
2489 src += 8*srcStride;\ | |
2490 dst += 8*dstStride;\ | |
2491 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\ | |
2492 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\ | |
2493 }\ | |
2494 | |
2495 #define H264_MC(OPNAME, SIZE) \ | |
2496 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\ | |
2497 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\ | |
2498 }\ | |
2499 \ | |
2500 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2501 uint8_t half[SIZE*SIZE];\ | |
2502 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\ | |
2503 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\ | |
2504 }\ | |
2505 \ | |
2506 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2507 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\ | |
2508 }\ | |
2509 \ | |
2510 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2511 uint8_t half[SIZE*SIZE];\ | |
2512 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\ | |
2513 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\ | |
2514 }\ | |
2515 \ | |
2516 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2517 uint8_t full[SIZE*(SIZE+5)];\ | |
2518 uint8_t * const full_mid= full + SIZE*2;\ | |
2519 uint8_t half[SIZE*SIZE];\ | |
2520 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ | |
2521 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\ | |
2522 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\ | |
2523 }\ | |
2524 \ | |
2525 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2526 uint8_t full[SIZE*(SIZE+5)];\ | |
2527 uint8_t * const full_mid= full + SIZE*2;\ | |
2528 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ | |
2529 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\ | |
2530 }\ | |
2531 \ | |
2532 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2533 uint8_t full[SIZE*(SIZE+5)];\ | |
2534 uint8_t * const full_mid= full + SIZE*2;\ | |
2535 uint8_t half[SIZE*SIZE];\ | |
2536 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ | |
2537 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\ | |
2538 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\ | |
2539 }\ | |
2540 \ | |
2541 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2542 uint8_t full[SIZE*(SIZE+5)];\ | |
2543 uint8_t * const full_mid= full + SIZE*2;\ | |
2544 uint8_t halfH[SIZE*SIZE];\ | |
2545 uint8_t halfV[SIZE*SIZE];\ | |
2546 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\ | |
2547 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ | |
2548 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ | |
2549 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ | |
2550 }\ | |
2551 \ | |
2552 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2553 uint8_t full[SIZE*(SIZE+5)];\ | |
2554 uint8_t * const full_mid= full + SIZE*2;\ | |
2555 uint8_t halfH[SIZE*SIZE];\ | |
2556 uint8_t halfV[SIZE*SIZE];\ | |
2557 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\ | |
2558 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ | |
2559 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ | |
2560 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ | |
2561 }\ | |
2562 \ | |
2563 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2564 uint8_t full[SIZE*(SIZE+5)];\ | |
2565 uint8_t * const full_mid= full + SIZE*2;\ | |
2566 uint8_t halfH[SIZE*SIZE];\ | |
2567 uint8_t halfV[SIZE*SIZE];\ | |
2568 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\ | |
2569 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ | |
2570 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ | |
2571 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ | |
2572 }\ | |
2573 \ | |
2574 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2575 uint8_t full[SIZE*(SIZE+5)];\ | |
2576 uint8_t * const full_mid= full + SIZE*2;\ | |
2577 uint8_t halfH[SIZE*SIZE];\ | |
2578 uint8_t halfV[SIZE*SIZE];\ | |
2579 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\ | |
2580 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ | |
2581 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ | |
2582 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ | |
2583 }\ | |
2584 \ | |
2585 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2586 int16_t tmp[SIZE*(SIZE+5)];\ | |
2587 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\ | |
2588 }\ | |
2589 \ | |
2590 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2591 int16_t tmp[SIZE*(SIZE+5)];\ | |
2592 uint8_t halfH[SIZE*SIZE];\ | |
2593 uint8_t halfHV[SIZE*SIZE];\ | |
2594 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\ | |
2595 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
2596 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\ | |
2597 }\ | |
2598 \ | |
2599 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2600 int16_t tmp[SIZE*(SIZE+5)];\ | |
2601 uint8_t halfH[SIZE*SIZE];\ | |
2602 uint8_t halfHV[SIZE*SIZE];\ | |
2603 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\ | |
2604 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
2605 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\ | |
2606 }\ | |
2607 \ | |
2608 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2609 uint8_t full[SIZE*(SIZE+5)];\ | |
2610 uint8_t * const full_mid= full + SIZE*2;\ | |
2611 int16_t tmp[SIZE*(SIZE+5)];\ | |
2612 uint8_t halfV[SIZE*SIZE];\ | |
2613 uint8_t halfHV[SIZE*SIZE];\ | |
2614 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ | |
2615 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ | |
2616 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
2617 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\ | |
2618 }\ | |
2619 \ | |
2620 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\ | |
2621 uint8_t full[SIZE*(SIZE+5)];\ | |
2622 uint8_t * const full_mid= full + SIZE*2;\ | |
2623 int16_t tmp[SIZE*(SIZE+5)];\ | |
2624 uint8_t halfV[SIZE*SIZE];\ | |
2625 uint8_t halfHV[SIZE*SIZE];\ | |
2626 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ | |
2627 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\ | |
2628 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\ | |
2629 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\ | |
2630 }\ | |
2631 | |
2632 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1) | |
2633 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7) | |
2634 #define op_put(a, b) a = cm[((b) + 16)>>5] | |
2635 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1) | |
2636 #define op2_put(a, b) a = cm[((b) + 512)>>10] | |
2637 | |
2638 H264_LOWPASS(put_ , op_put, op2_put) | |
2639 H264_LOWPASS(avg_ , op_avg, op2_avg) | |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
2640 H264_MC(put_, 2) |
1168 | 2641 H264_MC(put_, 4) |
2642 H264_MC(put_, 8) | |
2643 H264_MC(put_, 16) | |
2644 H264_MC(avg_, 4) | |
2645 H264_MC(avg_, 8) | |
2646 H264_MC(avg_, 16) | |
2647 | |
2648 #undef op_avg | |
2649 #undef op_put | |
2650 #undef op2_avg | |
2651 #undef op2_put | |
2652 #endif | |
2653 | |
4594 | 2654 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom ) |
2655 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) | |
2415 | 2656 #define H264_WEIGHT(W,H) \ |
2657 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \ | |
3029 | 2658 int y; \ |
2415 | 2659 offset <<= log2_denom; \ |
2660 if(log2_denom) offset += 1<<(log2_denom-1); \ | |
2661 for(y=0; y<H; y++, block += stride){ \ | |
2662 op_scale1(0); \ | |
2663 op_scale1(1); \ | |
2664 if(W==2) continue; \ | |
2665 op_scale1(2); \ | |
2666 op_scale1(3); \ | |
2667 if(W==4) continue; \ | |
2668 op_scale1(4); \ | |
2669 op_scale1(5); \ | |
2670 op_scale1(6); \ | |
2671 op_scale1(7); \ | |
2672 if(W==8) continue; \ | |
2673 op_scale1(8); \ | |
2674 op_scale1(9); \ | |
2675 op_scale1(10); \ | |
2676 op_scale1(11); \ | |
2677 op_scale1(12); \ | |
2678 op_scale1(13); \ | |
2679 op_scale1(14); \ | |
2680 op_scale1(15); \ | |
2681 } \ | |
2682 } \ | |
3029 | 2683 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \ |
2684 int y; \ | |
2685 offset = ((offset + 1) | 1) << log2_denom; \ | |
2415 | 2686 for(y=0; y<H; y++, dst += stride, src += stride){ \ |
2687 op_scale2(0); \ | |
2688 op_scale2(1); \ | |
2689 if(W==2) continue; \ | |
2690 op_scale2(2); \ | |
2691 op_scale2(3); \ | |
2692 if(W==4) continue; \ | |
2693 op_scale2(4); \ | |
2694 op_scale2(5); \ | |
2695 op_scale2(6); \ | |
2696 op_scale2(7); \ | |
2697 if(W==8) continue; \ | |
2698 op_scale2(8); \ | |
2699 op_scale2(9); \ | |
2700 op_scale2(10); \ | |
2701 op_scale2(11); \ | |
2702 op_scale2(12); \ | |
2703 op_scale2(13); \ | |
2704 op_scale2(14); \ | |
2705 op_scale2(15); \ | |
2706 } \ | |
2707 } | |
2708 | |
2709 H264_WEIGHT(16,16) | |
2710 H264_WEIGHT(16,8) | |
2711 H264_WEIGHT(8,16) | |
2712 H264_WEIGHT(8,8) | |
2713 H264_WEIGHT(8,4) | |
2714 H264_WEIGHT(4,8) | |
2715 H264_WEIGHT(4,4) | |
2716 H264_WEIGHT(4,2) | |
2717 H264_WEIGHT(2,4) | |
2718 H264_WEIGHT(2,2) | |
2719 | |
2720 #undef op_scale1 | |
2721 #undef op_scale2 | |
2722 #undef H264_WEIGHT | |
2723 | |
936 | 2724 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){ |
4176 | 2725 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
936 | 2726 int i; |
2727 | |
2728 for(i=0; i<h; i++){ | |
2729 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4]; | |
2730 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4]; | |
2731 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4]; | |
2732 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4]; | |
2733 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4]; | |
2734 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4]; | |
2735 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4]; | |
2736 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4]; | |
2737 dst+=dstStride; | |
2967 | 2738 src+=srcStride; |
936 | 2739 } |
2740 } | |
2741 | |
8590 | 2742 #if CONFIG_CAVS_DECODER |
3395
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2743 /* AVS specific */ |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2744 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx); |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2745 |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2746 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) { |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2747 put_pixels8_c(dst, src, stride, 8); |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2748 } |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2749 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) { |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2750 avg_pixels8_c(dst, src, stride, 8); |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2751 } |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2752 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) { |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2753 put_pixels16_c(dst, src, stride, 16); |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2754 } |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2755 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) { |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2756 avg_pixels16_c(dst, src, stride, 16); |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2757 } |
3432 | 2758 #endif /* CONFIG_CAVS_DECODER */ |
3395
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
2759 |
9586
c7420bfe4da0
Don't #if a function declaration and properly indent it.
ramiro
parents:
9585
diff
changeset
|
2760 void ff_mlp_init(DSPContext* c, AVCodecContext *avctx); |
9585 | 2761 |
9995
3141f69e3905
Do not check for both CONFIG_VC1_DECODER and CONFIG_WMV3_DECODER,
diego
parents:
9975
diff
changeset
|
2762 #if CONFIG_VC1_DECODER |
3526 | 2763 /* VC-1 specific */ |
2764 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx); | |
2765 | |
2766 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) { | |
2767 put_pixels8_c(dst, src, stride, 8); | |
2768 } | |
9437 | 2769 void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) { |
2770 avg_pixels8_c(dst, src, stride, 8); | |
2771 } | |
9995
3141f69e3905
Do not check for both CONFIG_VC1_DECODER and CONFIG_WMV3_DECODER,
diego
parents:
9975
diff
changeset
|
2772 #endif /* CONFIG_VC1_DECODER */ |
3526 | 2773 |
5887 | 2774 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx); |
5899 | 2775 |
4296 | 2776 /* H264 specific */ |
5411
362aec4ef932
Take care of some renames (Doxygen and function name) after the previous pure rename patch.
takis
parents:
5394
diff
changeset
|
2777 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx); |
4296 | 2778 |
8590 | 2779 #if CONFIG_RV30_DECODER |
8410 | 2780 void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx); |
2781 #endif /* CONFIG_RV30_DECODER */ | |
2782 | |
8590 | 2783 #if CONFIG_RV40_DECODER |
8232 | 2784 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){ |
2785 put_pixels16_xy2_c(dst, src, stride, 16); | |
2786 } | |
2787 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){ | |
2788 avg_pixels16_xy2_c(dst, src, stride, 16); | |
2789 } | |
2790 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){ | |
2791 put_pixels8_xy2_c(dst, src, stride, 8); | |
2792 } | |
2793 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){ | |
2794 avg_pixels8_xy2_c(dst, src, stride, 8); | |
2795 } | |
2796 | |
2797 void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx); | |
2798 #endif /* CONFIG_RV40_DECODER */ | |
2799 | |
936 | 2800 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){ |
4176 | 2801 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
936 | 2802 int i; |
2803 | |
2804 for(i=0; i<w; i++){ | |
2805 const int src_1= src[ -srcStride]; | |
2806 const int src0 = src[0 ]; | |
2807 const int src1 = src[ srcStride]; | |
2808 const int src2 = src[2*srcStride]; | |
2809 const int src3 = src[3*srcStride]; | |
2810 const int src4 = src[4*srcStride]; | |
2811 const int src5 = src[5*srcStride]; | |
2812 const int src6 = src[6*srcStride]; | |
2813 const int src7 = src[7*srcStride]; | |
2814 const int src8 = src[8*srcStride]; | |
2815 const int src9 = src[9*srcStride]; | |
2816 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4]; | |
2817 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4]; | |
2818 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4]; | |
2819 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4]; | |
2820 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4]; | |
2821 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4]; | |
2822 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4]; | |
2823 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4]; | |
2824 src++; | |
2825 dst++; | |
2826 } | |
2827 } | |
2828 | |
/* mspel full-pel case: plain 8x8 copy, no filtering. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2832 | |
/* mspel quarter-pel left: average the source with its horizontally lowpassed version. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t htmp[64];
    wmv2_mspel8_h_lowpass(htmp, src, 8, stride, 8);
    put_pixels8_l2(dst, src, htmp, stride, stride, 8, 8);
}
2838 | |
/* mspel half-pel horizontal: horizontal lowpass written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2842 | |
/* mspel quarter-pel right: average src shifted one pixel right with the lowpassed block. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t htmp[64];
    wmv2_mspel8_h_lowpass(htmp, src, 8, stride, 8);
    put_pixels8_l2(dst, src + 1, htmp, stride, stride, 8, 8);
}
2848 | |
/* mspel half-pel vertical: vertical lowpass written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2852 | |
/* mspel (1,2): average the vertically filtered block with the H+V filtered block. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t tmpH[88];   /* 8x11: one row above and two below, needed by the vertical taps */
    uint8_t tmpV[64];
    uint8_t tmpHV[64];
    wmv2_mspel8_h_lowpass(tmpH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(tmpV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(tmpHV, tmpH + 8, 8, 8, 8);
    put_pixels8_l2(dst, tmpV, tmpHV, stride, 8, 8, 8);
}
/* mspel (3,2): like mc12 but the vertical-only pass starts one pixel to the right. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t tmpH[88];   /* 8x11 horizontally filtered area */
    uint8_t tmpV[64];
    uint8_t tmpHV[64];
    wmv2_mspel8_h_lowpass(tmpH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(tmpV, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(tmpHV, tmpH + 8, 8, 8, 8);
    put_pixels8_l2(dst, tmpV, tmpHV, stride, 8, 8, 8);
}
/* mspel (2,2): horizontal lowpass followed by vertical lowpass, no averaging. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t tmpH[88];   /* 8x11 horizontally filtered area */
    wmv2_mspel8_h_lowpass(tmpH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, tmpH + 8, stride, 8, 8);
}
2876 | |
1644 | 2877 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){ |
10749
5cca4b6c459d
Get rid of pointless CONFIG_ANY_H263 preprocessor definition.
diego
parents:
10748
diff
changeset
|
2878 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { |
1644 | 2879 int x; |
2880 const int strength= ff_h263_loop_filter_strength[qscale]; | |
2967 | 2881 |
1644 | 2882 for(x=0; x<8; x++){ |
2883 int d1, d2, ad1; | |
2884 int p0= src[x-2*stride]; | |
2885 int p1= src[x-1*stride]; | |
2886 int p2= src[x+0*stride]; | |
2887 int p3= src[x+1*stride]; | |
2888 int d = (p0 - p3 + 4*(p2 - p1)) / 8; | |
2889 | |
2890 if (d<-2*strength) d1= 0; | |
2891 else if(d<- strength) d1=-2*strength - d; | |
2892 else if(d< strength) d1= d; | |
2893 else if(d< 2*strength) d1= 2*strength - d; | |
2894 else d1= 0; | |
2967 | 2895 |
1644 | 2896 p1 += d1; |
2897 p2 -= d1; | |
2898 if(p1&256) p1= ~(p1>>31); | |
2899 if(p2&256) p2= ~(p2>>31); | |
2967 | 2900 |
1644 | 2901 src[x-1*stride] = p1; |
2902 src[x+0*stride] = p2; | |
2903 | |
4001 | 2904 ad1= FFABS(d1)>>1; |
2967 | 2905 |
4594 | 2906 d2= av_clip((p0-p3)/4, -ad1, ad1); |
2967 | 2907 |
1644 | 2908 src[x-2*stride] = p0 - d2; |
2909 src[x+ stride] = p3 + d2; | |
2910 } | |
5394
e9a6215f4e3a
help some gcc version to optimize out those functions
aurel
parents:
5291
diff
changeset
|
2911 } |
1644 | 2912 } |
2913 | |
2914 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){ | |
10749
5cca4b6c459d
Get rid of pointless CONFIG_ANY_H263 preprocessor definition.
diego
parents:
10748
diff
changeset
|
2915 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { |
1644 | 2916 int y; |
2917 const int strength= ff_h263_loop_filter_strength[qscale]; | |
2967 | 2918 |
1644 | 2919 for(y=0; y<8; y++){ |
2920 int d1, d2, ad1; | |
2921 int p0= src[y*stride-2]; | |
2922 int p1= src[y*stride-1]; | |
2923 int p2= src[y*stride+0]; | |
2924 int p3= src[y*stride+1]; | |
2925 int d = (p0 - p3 + 4*(p2 - p1)) / 8; | |
2926 | |
2927 if (d<-2*strength) d1= 0; | |
2928 else if(d<- strength) d1=-2*strength - d; | |
2929 else if(d< strength) d1= d; | |
2930 else if(d< 2*strength) d1= 2*strength - d; | |
2931 else d1= 0; | |
2967 | 2932 |
1644 | 2933 p1 += d1; |
2934 p2 -= d1; | |
2935 if(p1&256) p1= ~(p1>>31); | |
2936 if(p2&256) p2= ~(p2>>31); | |
2967 | 2937 |
1644 | 2938 src[y*stride-1] = p1; |
2939 src[y*stride+0] = p2; | |
2940 | |
4001 | 2941 ad1= FFABS(d1)>>1; |
2967 | 2942 |
4594 | 2943 d2= av_clip((p0-p3)/4, -ad1, ad1); |
2967 | 2944 |
1644 | 2945 src[y*stride-2] = p0 - d2; |
2946 src[y*stride+1] = p3 + d2; | |
2947 } | |
5394
e9a6215f4e3a
help some gcc version to optimize out those functions
aurel
parents:
5291
diff
changeset
|
2948 } |
1644 | 2949 } |
936 | 2950 |
/**
 * H.261 in-loop filter: separable 1-2-1 smoothing of an 8x8 block.
 * The vertical pass writes into a temporary (border rows only scaled by 4),
 * the horizontal pass writes back with rounding (border columns passed
 * through with their own rounding).
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x, y;
    int tmp[64];

    /* vertical 1-2-1 pass; rows 0 and 7 are copied, pre-scaled by 4 so the
       second pass normalizes them identically */
    for(x = 0; x < 8; x++){
        tmp[x      ] = 4 * src[x           ];
        tmp[x + 7*8] = 4 * src[x + 7*stride];
    }
    for(y = 1; y < 7; y++){
        for(x = 0; x < 8; x++){
            tmp[y*8 + x] = src[y*stride + x - stride]
                         + 2*src[y*stride + x]
                         + src[y*stride + x + stride];
        }
    }

    /* horizontal 1-2-1 pass with rounding; columns 0 and 7 only renormalized */
    for(y = 0; y < 8; y++){
        src[    y*stride] = (tmp[    y*8] + 2) >> 2;
        src[7 + y*stride] = (tmp[7 + y*8] + 2) >> 2;
        for(x = 1; x < 7; x++){
            src[y*stride + x] = (tmp[y*8 + x - 1]
                                 + 2*tmp[y*8 + x]
                                 + tmp[y*8 + x + 1] + 8) >> 4;
        }
    }
}
b6f2add2511e
h261 decoder by (Maarten Daniels <maarten.daniels at student dot luc dot ac dot be>)
michael
parents:
1984
diff
changeset
|
2977 |
10941
28edcc8c54c0
Mark the h264 c loop filter as av_always_inline av_flatten to make sure its
michael
parents:
10940
diff
changeset
|
2978 static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0) |
2633 | 2979 { |
2980 int i, d; | |
2981 for( i = 0; i < 4; i++ ) { | |
2982 if( tc0[i] < 0 ) { | |
2983 pix += 4*ystride; | |
2984 continue; | |
2985 } | |
2986 for( d = 0; d < 4; d++ ) { | |
2987 const int p0 = pix[-1*xstride]; | |
2988 const int p1 = pix[-2*xstride]; | |
2989 const int p2 = pix[-3*xstride]; | |
2990 const int q0 = pix[0]; | |
2991 const int q1 = pix[1*xstride]; | |
2992 const int q2 = pix[2*xstride]; | |
2967 | 2993 |
4001 | 2994 if( FFABS( p0 - q0 ) < alpha && |
2995 FFABS( p1 - p0 ) < beta && | |
2996 FFABS( q1 - q0 ) < beta ) { | |
2967 | 2997 |
2633 | 2998 int tc = tc0[i]; |
2999 int i_delta; | |
2967 | 3000 |
4001 | 3001 if( FFABS( p2 - p0 ) < beta ) { |
10940
563cb9b1a9b7
skip outer pixels if possible in h264_loop_filter_luma_c().
michael
parents:
10878
diff
changeset
|
3002 if(tc0[i]) |
4594 | 3003 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] ); |
2633 | 3004 tc++; |
3005 } | |
4001 | 3006 if( FFABS( q2 - q0 ) < beta ) { |
10940
563cb9b1a9b7
skip outer pixels if possible in h264_loop_filter_luma_c().
michael
parents:
10878
diff
changeset
|
3007 if(tc0[i]) |
4594 | 3008 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] ); |
2633 | 3009 tc++; |
3010 } | |
2967 | 3011 |
4594 | 3012 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); |
3013 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */ | |
3014 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */ | |
2633 | 3015 } |
3016 pix += ystride; | |
3017 } | |
3018 } | |
3019 } | |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/* Luma deblock, samples across the edge are `stride` apart, step 1 along it. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/* Luma deblock, samples across the edge are adjacent, step `stride` along it. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
3028 | |
10941
28edcc8c54c0
Mark the h264 c loop filter as av_always_inline av_flatten to make sure its
michael
parents:
10940
diff
changeset
|
3029 static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta) |
8395
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3030 { |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3031 int d; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3032 for( d = 0; d < 16; d++ ) { |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3033 const int p2 = pix[-3*xstride]; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3034 const int p1 = pix[-2*xstride]; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3035 const int p0 = pix[-1*xstride]; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3036 |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3037 const int q0 = pix[ 0*xstride]; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3038 const int q1 = pix[ 1*xstride]; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3039 const int q2 = pix[ 2*xstride]; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3040 |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3041 if( FFABS( p0 - q0 ) < alpha && |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3042 FFABS( p1 - p0 ) < beta && |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3043 FFABS( q1 - q0 ) < beta ) { |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3044 |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3045 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){ |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3046 if( FFABS( p2 - p0 ) < beta) |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3047 { |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3048 const int p3 = pix[-4*xstride]; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3049 /* p0', p1', p2' */ |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3050 pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3051 pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3052 pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3053 } else { |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3054 /* p0' */ |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3055 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3056 } |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3057 if( FFABS( q2 - q0 ) < beta) |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3058 { |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3059 const int q3 = pix[3*xstride]; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3060 /* q0', q1', q2' */ |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3061 pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3062 pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3063 pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3064 } else { |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3065 /* q0' */ |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3066 pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3067 } |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3068 }else{ |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3069 /* p0', q0' */ |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3070 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3071 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3072 } |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3073 } |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3074 pix += ystride; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3075 } |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3076 } |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
/* Intra luma deblock, samples across the edge are `stride` apart. */
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
}
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
/* Intra luma deblock, samples across the edge are adjacent in memory. */
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
}
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
3085 |
10941
28edcc8c54c0
Mark the h264 c loop filter as av_always_inline av_flatten to make sure its
michael
parents:
10940
diff
changeset
|
3086 static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0) |
2633 | 3087 { |
3088 int i, d; | |
3089 for( i = 0; i < 4; i++ ) { | |
3090 const int tc = tc0[i]; | |
3091 if( tc <= 0 ) { | |
3092 pix += 2*ystride; | |
3093 continue; | |
3094 } | |
3095 for( d = 0; d < 2; d++ ) { | |
3096 const int p0 = pix[-1*xstride]; | |
3097 const int p1 = pix[-2*xstride]; | |
3098 const int q0 = pix[0]; | |
3099 const int q1 = pix[1*xstride]; | |
3100 | |
4001 | 3101 if( FFABS( p0 - q0 ) < alpha && |
3102 FFABS( p1 - p0 ) < beta && | |
3103 FFABS( q1 - q0 ) < beta ) { | |
2633 | 3104 |
4594 | 3105 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); |
3106 | |
3107 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */ | |
3108 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */ | |
2633 | 3109 } |
3110 pix += ystride; | |
3111 } | |
3112 } | |
3113 } | |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/* Chroma deblock, samples across the edge are `stride` apart. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/* Chroma deblock, samples across the edge are adjacent in memory. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
3122 | |
10941
28edcc8c54c0
Mark the h264 c loop filter as av_always_inline av_flatten to make sure its
michael
parents:
10940
diff
changeset
|
3123 static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta) |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3124 { |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3125 int d; |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3126 for( d = 0; d < 8; d++ ) { |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3127 const int p0 = pix[-1*xstride]; |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3128 const int p1 = pix[-2*xstride]; |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3129 const int q0 = pix[0]; |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3130 const int q1 = pix[1*xstride]; |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3131 |
4001 | 3132 if( FFABS( p0 - q0 ) < alpha && |
3133 FFABS( p1 - p0 ) < beta && | |
3134 FFABS( q1 - q0 ) < beta ) { | |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3135 |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3136 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */ |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3137 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */ |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3138 } |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3139 pix += ystride; |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3140 } |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3141 } |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/* Intra chroma deblock, samples across the edge are `stride` apart. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
/* Intra chroma deblock, samples across the edge are adjacent in memory. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
3150 |
/**
 * Sum of absolute differences over a 16-wide, h-tall block.
 * The v (context) argument is unused in the C reference implementation.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, row, k;

    for (row = 0; row < h; row++) {
        for (k = 0; k < 16; k++)
            sum += abs(pix1[k] - pix2[k]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3178 | |
/**
 * SAD of a 16-wide block against the horizontal half-pel interpolation of
 * pix2 (pairwise average via the avg2 macro defined earlier in this file).
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, row, k;

    for (row = 0; row < h; row++) {
        for (k = 0; k < 16; k++)
            sum += abs(pix1[k] - avg2(pix2[k], pix2[k + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3206 | |
/**
 * SAD of a 16-wide block against the vertical half-pel interpolation of pix2
 * (average of each sample with the one a full line below, via avg2).
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, row, k;

    for (row = 0; row < h; row++) {
        for (k = 0; k < 16; k++)
            sum += abs(pix1[k] - avg2(pix2[k], pix2[k + line_size]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3236 | |
/**
 * SAD of a 16-wide block against the diagonal half-pel interpolation of pix2
 * (2x2 neighbourhood average via the avg4 macro defined earlier in this file).
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, row, k;

    for (row = 0; row < h; row++) {
        for (k = 0; k < 16; k++)
            sum += abs(pix1[k] - avg4(pix2[k],             pix2[k + 1],
                                      pix2[k + line_size], pix2[k + line_size + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3266 | |
/**
 * Sum of absolute differences over an 8-wide, h-tall block.
 * The v (context) argument is unused in the C reference implementation.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, row, k;

    for (row = 0; row < h; row++) {
        for (k = 0; k < 8; k++)
            sum += abs(pix1[k] - pix2[k]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3286 | |
/* SAD of an 8-wide block vs the horizontal half-pel interpolation (avg2). */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, row, k;

    for (row = 0; row < h; row++) {
        for (k = 0; k < 8; k++)
            sum += abs(pix1[k] - avg2(pix2[k], pix2[k + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3306 | |
/* SAD of an 8-wide block vs the vertical half-pel interpolation (avg2). */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, row, k;

    for (row = 0; row < h; row++) {
        for (k = 0; k < 8; k++)
            sum += abs(pix1[k] - avg2(pix2[k], pix2[k + line_size]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3328 | |
/**
 * SAD of an 8-wide block against a half-pel diagonally interpolated
 * reference (rounded average of the 2x2 neighbourhood; column 8 and the
 * row below the last one are read for the edges).
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;
    uint8_t *below = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3350 | |
2834 | 3351 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){ |
3352 MpegEncContext *c = v; | |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3353 int score1=0; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3354 int score2=0; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3355 int x,y; |
2066 | 3356 |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3357 for(y=0; y<h; y++){ |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3358 for(x=0; x<16; x++){ |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3359 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]); |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3360 } |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3361 if(y+1<h){ |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3362 for(x=0; x<15; x++){ |
4001 | 3363 score2+= FFABS( s1[x ] - s1[x +stride] |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3364 - s1[x+1] + s1[x+1+stride]) |
4001 | 3365 -FFABS( s2[x ] - s2[x +stride] |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3366 - s2[x+1] + s2[x+1+stride]); |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3367 } |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3368 } |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3369 s1+= stride; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3370 s2+= stride; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3371 } |
2066 | 3372 |
4001 | 3373 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; |
3374 else return score1 + FFABS(score2)*8; | |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3375 } |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3376 |
2834 | 3377 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){ |
3378 MpegEncContext *c = v; | |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3379 int score1=0; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3380 int score2=0; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3381 int x,y; |
2967 | 3382 |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3383 for(y=0; y<h; y++){ |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3384 for(x=0; x<8; x++){ |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3385 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]); |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3386 } |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3387 if(y+1<h){ |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3388 for(x=0; x<7; x++){ |
4001 | 3389 score2+= FFABS( s1[x ] - s1[x +stride] |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3390 - s1[x+1] + s1[x+1+stride]) |
4001 | 3391 -FFABS( s2[x ] - s2[x +stride] |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3392 - s2[x+1] + s2[x+1+stride]); |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3393 } |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3394 } |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3395 s1+= stride; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3396 s2+= stride; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3397 } |
2967 | 3398 |
4001 | 3399 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; |
3400 else return score1 + FFABS(score2)*8; | |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3401 } |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3402 |
1784 | 3403 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){ |
3404 int i; | |
3405 unsigned int sum=0; | |
3406 | |
3407 for(i=0; i<8*8; i++){ | |
3408 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT)); | |
3409 int w= weight[i]; | |
3410 b>>= RECON_SHIFT; | |
3411 assert(-512<b && b<512); | |
3412 | |
3413 sum += (w*b)*(w*b)>>4; | |
3414 } | |
3415 return sum>>2; | |
3416 } | |
3417 | |
3418 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){ | |
3419 int i; | |
3420 | |
3421 for(i=0; i<8*8; i++){ | |
3422 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT); | |
2967 | 3423 } |
1784 | 3424 } |
3425 | |
1100 | 3426 /** |
3427 * permutes an 8x8 block. | |
1101 | 3428 * @param block the block which will be permuted according to the given permutation vector |
1100 | 3429 * @param permutation the permutation vector |
3430 * @param last the last non zero coefficient in scantable order, used to speed the permutation up | |
2967 | 3431 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not |
1101 | 3432 * (inverse) permutated to scantable order! |
1100 | 3433 */ |
1064 | 3434 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last) |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
88
diff
changeset
|
3435 { |
764 | 3436 int i; |
945 | 3437 DCTELEM temp[64]; |
2967 | 3438 |
764 | 3439 if(last<=0) return; |
5129 | 3440 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
88
diff
changeset
|
3441 |
764 | 3442 for(i=0; i<=last; i++){ |
3443 const int j= scantable[i]; | |
3444 temp[j]= block[j]; | |
3445 block[j]=0; | |
3446 } | |
2967 | 3447 |
764 | 3448 for(i=0; i<=last; i++){ |
3449 const int j= scantable[i]; | |
3450 const int perm_j= permutation[j]; | |
3451 block[perm_j]= temp[j]; | |
3452 } | |
174
ac5075a55488
new IDCT code by Michael Niedermayer (michaelni@gmx.at) - #define SIMPLE_IDCT to enable
arpi_esp
parents:
88
diff
changeset
|
3453 } |
34 | 3454 |
/* Dummy compare function: reports every pair of blocks as identical. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
3458 | |
3459 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){ | |
3460 int i; | |
2967 | 3461 |
8976
e7d87561b42b
Making the arrays accomodate an extra intra 8x8 cmp function
romansh
parents:
8785
diff
changeset
|
3462 memset(cmp, 0, sizeof(void*)*6); |
e7d87561b42b
Making the arrays accomodate an extra intra 8x8 cmp function
romansh
parents:
8785
diff
changeset
|
3463 |
e7d87561b42b
Making the arrays accomodate an extra intra 8x8 cmp function
romansh
parents:
8785
diff
changeset
|
3464 for(i=0; i<6; i++){ |
1729 | 3465 switch(type&0xFF){ |
3466 case FF_CMP_SAD: | |
3467 cmp[i]= c->sad[i]; | |
3468 break; | |
3469 case FF_CMP_SATD: | |
3470 cmp[i]= c->hadamard8_diff[i]; | |
3471 break; | |
3472 case FF_CMP_SSE: | |
3473 cmp[i]= c->sse[i]; | |
3474 break; | |
3475 case FF_CMP_DCT: | |
3476 cmp[i]= c->dct_sad[i]; | |
3477 break; | |
3010
533c6386eca9
8x8 integer dct from x264 as cmp function (under CONFIG_GPL)
michael
parents:
2979
diff
changeset
|
3478 case FF_CMP_DCT264: |
533c6386eca9
8x8 integer dct from x264 as cmp function (under CONFIG_GPL)
michael
parents:
2979
diff
changeset
|
3479 cmp[i]= c->dct264_sad[i]; |
533c6386eca9
8x8 integer dct from x264 as cmp function (under CONFIG_GPL)
michael
parents:
2979
diff
changeset
|
3480 break; |
2382 | 3481 case FF_CMP_DCTMAX: |
3482 cmp[i]= c->dct_max[i]; | |
3483 break; | |
1729 | 3484 case FF_CMP_PSNR: |
3485 cmp[i]= c->quant_psnr[i]; | |
3486 break; | |
3487 case FF_CMP_BIT: | |
3488 cmp[i]= c->bit[i]; | |
3489 break; | |
3490 case FF_CMP_RD: | |
3491 cmp[i]= c->rd[i]; | |
3492 break; | |
3493 case FF_CMP_VSAD: | |
3494 cmp[i]= c->vsad[i]; | |
3495 break; | |
3496 case FF_CMP_VSSE: | |
3497 cmp[i]= c->vsse[i]; | |
3498 break; | |
3499 case FF_CMP_ZERO: | |
3500 cmp[i]= zero_cmp; | |
3501 break; | |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3502 case FF_CMP_NSSE: |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3503 cmp[i]= c->nsse[i]; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
3504 break; |
8590 | 3505 #if CONFIG_SNOW_ENCODER |
2184 | 3506 case FF_CMP_W53: |
3507 cmp[i]= c->w53[i]; | |
3508 break; | |
3509 case FF_CMP_W97: | |
3510 cmp[i]= c->w97[i]; | |
3511 break; | |
3373
b8996cc5ccae
Disable w53 and w97 cmp methods when snow encoder is disabled
gpoirier
parents:
3323
diff
changeset
|
3512 #endif |
1729 | 3513 default: |
3514 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n"); | |
3515 } | |
3516 } | |
3517 } | |
3518 | |
8288 | 3519 static void clear_block_c(DCTELEM *block) |
3520 { | |
3521 memset(block, 0, sizeof(DCTELEM)*64); | |
3522 } | |
3523 | |
1101 | 3524 /** |
3525 * memset(blocks, 0, sizeof(DCTELEM)*6*64) | |
3526 */ | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
3527 static void clear_blocks_c(DCTELEM *blocks) |
296 | 3528 { |
3529 memset(blocks, 0, sizeof(DCTELEM)*6*64); | |
3530 } | |
3531 | |
866 | 3532 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){ |
6385 | 3533 long i; |
3534 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){ | |
3535 long a = *(long*)(src+i); | |
3536 long b = *(long*)(dst+i); | |
3537 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80); | |
866 | 3538 } |
3539 for(; i<w; i++) | |
3540 dst[i+0] += src[i+0]; | |
3541 } | |
3542 | |
6384 | 3543 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ |
6385 | 3544 long i; |
6384 | 3545 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){ |
3546 long a = *(long*)(src1+i); | |
3547 long b = *(long*)(src2+i); | |
6385 | 3548 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80); |
6384 | 3549 } |
3550 for(; i<w; i++) | |
3551 dst[i] = src1[i]+src2[i]; | |
3552 } | |
3553 | |
866 | 3554 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ |
6385 | 3555 long i; |
8590 | 3556 #if !HAVE_FAST_UNALIGNED |
6385 | 3557 if((long)src2 & (sizeof(long)-1)){ |
6386 | 3558 for(i=0; i+7<w; i+=8){ |
3559 dst[i+0] = src1[i+0]-src2[i+0]; | |
3560 dst[i+1] = src1[i+1]-src2[i+1]; | |
3561 dst[i+2] = src1[i+2]-src2[i+2]; | |
3562 dst[i+3] = src1[i+3]-src2[i+3]; | |
3563 dst[i+4] = src1[i+4]-src2[i+4]; | |
3564 dst[i+5] = src1[i+5]-src2[i+5]; | |
3565 dst[i+6] = src1[i+6]-src2[i+6]; | |
3566 dst[i+7] = src1[i+7]-src2[i+7]; | |
3567 } | |
6385 | 3568 }else |
3569 #endif | |
3570 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){ | |
3571 long a = *(long*)(src1+i); | |
3572 long b = *(long*)(src2+i); | |
3573 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80); | |
3574 } | |
866 | 3575 for(; i<w; i++) |
3576 dst[i+0] = src1[i+0]-src2[i+0]; | |
3577 } | |
3578 | |
/**
 * HuffYUV median prediction decode: dst[i] = median(left, top, left+top-topleft) + diff[i].
 * src1 is the row above; *left / *left_top carry the running predictor
 * state across calls and are updated on return.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t pred_l  = *left;
    uint8_t pred_tl = *left_top;

    for (i = 0; i < w; i++) {
        pred_l  = mid_pred(pred_l, src1[i], (pred_l + src1[i] - pred_tl) & 0xFF) + diff[i];
        pred_tl = src1[i];
        dst[i]  = pred_l;
    }

    *left     = pred_l;
    *left_top = pred_tl;
}
3595 | |
/**
 * HuffYUV median prediction encode: dst[i] = src2[i] - median(left, top, left+top-topleft).
 * src1 is the row above, src2 the current row; *left / *left_top carry the
 * running predictor state across calls and are updated on return.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t pred_l  = *left;
    uint8_t pred_tl = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(pred_l, src1[i], (pred_l + src1[i] - pred_tl) & 0xFF);
        pred_tl = src1[i];
        pred_l  = src2[i];
        dst[i]  = pred_l - pred;
    }

    *left     = pred_l;
    *left_top = pred_tl;
}
3613 | |
/**
 * HuffYUV left prediction decode: running prefix sum of src into dst.
 * @param acc incoming accumulator (previous left value)
 * @return the final accumulator so the caller can continue on the next call
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i = 0;

    /* two samples per iteration; the trailing loop handles a final odd one */
    while (i < w - 1) {
        acc += src[i];
        dst[i++] = acc;
        acc += src[i];
        dst[i++] = acc;
    }
    while (i < w) {
        acc += src[i];
        dst[i++] = acc;
    }

    return acc;
}
3632 | |
/* Byte positions of the channels inside a packed 32-bit BGRA pixel,
 * depending on host endianness. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * HuffYUV left prediction decode for packed 32-bit BGRA: per-channel
 * running prefix sums of src into dst. The four channel accumulators are
 * passed in and updated through red/green/blue/alpha.
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r = *red, g = *green, b = *blue, a = *alpha;

    for (i = 0; i < w; i++) {
        b += src[4*i+B];
        g += src[4*i+G];
        r += src[4*i+R];
        a += src[4*i+A];

        dst[4*i+B] = b;
        dst[4*i+G] = g;
        dst[4*i+R] = r;
        dst[4*i+A] = a;
    }

    *red   = r;
    *green = g;
    *blue  = b;
    *alpha = a;
}
#undef B
#undef G
#undef R
#undef A
10370 | 3673 |
/* Butterfly primitives for the 8x8 Hadamard transform. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

/**
 * SATD: sum of absolute values of the 8x8 Hadamard transform of the
 * difference between src and dst. Only h==8 is supported.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int row, col;
    int t[64];
    int score = 0;

    assert(h==8);

    /* horizontal transform of the pixel differences, one row at a time */
    for (row = 0; row < 8; row++) {
        //FIXME try pointer walks
        BUTTERFLY2(t[8*row+0], t[8*row+1], src[stride*row+0]-dst[stride*row+0], src[stride*row+1]-dst[stride*row+1]);
        BUTTERFLY2(t[8*row+2], t[8*row+3], src[stride*row+2]-dst[stride*row+2], src[stride*row+3]-dst[stride*row+3]);
        BUTTERFLY2(t[8*row+4], t[8*row+5], src[stride*row+4]-dst[stride*row+4], src[stride*row+5]-dst[stride*row+5]);
        BUTTERFLY2(t[8*row+6], t[8*row+7], src[stride*row+6]-dst[stride*row+6], src[stride*row+7]-dst[stride*row+7]);

        BUTTERFLY1(t[8*row+0], t[8*row+2]);
        BUTTERFLY1(t[8*row+1], t[8*row+3]);
        BUTTERFLY1(t[8*row+4], t[8*row+6]);
        BUTTERFLY1(t[8*row+5], t[8*row+7]);

        BUTTERFLY1(t[8*row+0], t[8*row+4]);
        BUTTERFLY1(t[8*row+1], t[8*row+5]);
        BUTTERFLY1(t[8*row+2], t[8*row+6]);
        BUTTERFLY1(t[8*row+3], t[8*row+7]);
    }

    /* vertical transform, folding absolute values into the score */
    for (col = 0; col < 8; col++) {
        BUTTERFLY1(t[8*0+col], t[8*1+col]);
        BUTTERFLY1(t[8*2+col], t[8*3+col]);
        BUTTERFLY1(t[8*4+col], t[8*5+col]);
        BUTTERFLY1(t[8*6+col], t[8*7+col]);

        BUTTERFLY1(t[8*0+col], t[8*2+col]);
        BUTTERFLY1(t[8*1+col], t[8*3+col]);
        BUTTERFLY1(t[8*4+col], t[8*6+col]);
        BUTTERFLY1(t[8*5+col], t[8*7+col]);

        score +=
             BUTTERFLYA(t[8*0+col], t[8*4+col])
            +BUTTERFLYA(t[8*1+col], t[8*5+col])
            +BUTTERFLYA(t[8*2+col], t[8*6+col])
            +BUTTERFLYA(t[8*3+col], t[8*7+col]);
    }
    return score;
}
3740 | |
/**
 * Intra SATD: sum of absolute values of the 8x8 Hadamard transform of src
 * itself, with the DC (mean) term subtracted. Only h==8 is supported.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int row, col;
    int t[64];
    int score = 0;

    assert(h==8);

    /* horizontal transform of the source pixels, one row at a time */
    for (row = 0; row < 8; row++) {
        //FIXME try pointer walks
        BUTTERFLY2(t[8*row+0], t[8*row+1], src[stride*row+0], src[stride*row+1]);
        BUTTERFLY2(t[8*row+2], t[8*row+3], src[stride*row+2], src[stride*row+3]);
        BUTTERFLY2(t[8*row+4], t[8*row+5], src[stride*row+4], src[stride*row+5]);
        BUTTERFLY2(t[8*row+6], t[8*row+7], src[stride*row+6], src[stride*row+7]);

        BUTTERFLY1(t[8*row+0], t[8*row+2]);
        BUTTERFLY1(t[8*row+1], t[8*row+3]);
        BUTTERFLY1(t[8*row+4], t[8*row+6]);
        BUTTERFLY1(t[8*row+5], t[8*row+7]);

        BUTTERFLY1(t[8*row+0], t[8*row+4]);
        BUTTERFLY1(t[8*row+1], t[8*row+5]);
        BUTTERFLY1(t[8*row+2], t[8*row+6]);
        BUTTERFLY1(t[8*row+3], t[8*row+7]);
    }

    /* vertical transform, folding absolute values into the score */
    for (col = 0; col < 8; col++) {
        BUTTERFLY1(t[8*0+col], t[8*1+col]);
        BUTTERFLY1(t[8*2+col], t[8*3+col]);
        BUTTERFLY1(t[8*4+col], t[8*5+col]);
        BUTTERFLY1(t[8*6+col], t[8*7+col]);

        BUTTERFLY1(t[8*0+col], t[8*2+col]);
        BUTTERFLY1(t[8*1+col], t[8*3+col]);
        BUTTERFLY1(t[8*4+col], t[8*6+col]);
        BUTTERFLY1(t[8*5+col], t[8*7+col]);

        score +=
             BUTTERFLYA(t[8*0+col], t[8*4+col])
            +BUTTERFLYA(t[8*1+col], t[8*5+col])
            +BUTTERFLYA(t[8*2+col], t[8*6+col])
            +BUTTERFLYA(t[8*3+col], t[8*7+col]);
    }

    score -= FFABS(t[8*0] + t[8*4]); // -mean

    return score;
}
3788 | |
1708 | 3789 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){ |
936 | 3790 MpegEncContext * const s= (MpegEncContext *)c; |
10961
34a65026fa06
Move array specifiers outside DECLARE_ALIGNED() invocations
mru
parents:
10941
diff
changeset
|
3791 DECLARE_ALIGNED_16(uint64_t, aligned_temp)[sizeof(DCTELEM)*64/8]; |
1016 | 3792 DCTELEM * const temp= (DCTELEM*)aligned_temp; |
2967 | 3793 |
1708 | 3794 assert(h==8); |
936 | 3795 |
3796 s->dsp.diff_pixels(temp, src1, src2, stride); | |
1092 | 3797 s->dsp.fdct(temp); |
4988
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
3798 return s->dsp.sum_abs_dctelem(temp); |
936 | 3799 } |
3800 | |
#if CONFIG_GPL
/* One 8-point pass of the H.264 8x8 integer transform;
 * SRC()/DST() are redefined by the caller to select row or column access. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0, a0 + a1 ) ;\
    DST(1, a4 + (a7>>2)) ;\
    DST(2, a2 + (a3>>1)) ;\
    DST(3, a5 + (a6>>2)) ;\
    DST(4, a0 - a1 ) ;\
    DST(5, a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * SAD of the H.264 8x8 integer transform of the difference between two
 * 8x8 blocks.
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s = (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum = 0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* horizontal pass: transform each row in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* vertical pass: accumulate absolute values instead of storing */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
533c6386eca9
8x8 integer dct from x264 as cmp function (under CONFIG_GPL)
michael
parents:
2979
diff
changeset
|
3853 |
2382 | 3854 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){ |
3855 MpegEncContext * const s= (MpegEncContext *)c; | |
10961
34a65026fa06
Move array specifiers outside DECLARE_ALIGNED() invocations
mru
parents:
10941
diff
changeset
|
3856 DECLARE_ALIGNED_16(uint64_t, aligned_temp)[sizeof(DCTELEM)*64/8]; |
2382 | 3857 DCTELEM * const temp= (DCTELEM*)aligned_temp; |
3858 int sum=0, i; | |
2967 | 3859 |
2382 | 3860 assert(h==8); |
3861 | |
3862 s->dsp.diff_pixels(temp, src1, src2, stride); | |
3863 s->dsp.fdct(temp); | |
3864 | |
3865 for(i=0; i<64; i++) | |
4001 | 3866 sum= FFMAX(sum, FFABS(temp[i])); |
2967 | 3867 |
2382 | 3868 return sum; |
3869 } | |
3870 | |
1708 | 3871 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){ |
936 | 3872 MpegEncContext * const s= (MpegEncContext *)c; |
10961
34a65026fa06
Move array specifiers outside DECLARE_ALIGNED() invocations
mru
parents:
10941
diff
changeset
|
3873 DECLARE_ALIGNED_16(uint64_t, aligned_temp)[sizeof(DCTELEM)*64*2/8]; |
1016 | 3874 DCTELEM * const temp= (DCTELEM*)aligned_temp; |
3875 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64; | |
936 | 3876 int sum=0, i; |
3877 | |
1708 | 3878 assert(h==8); |
936 | 3879 s->mb_intra=0; |
2967 | 3880 |
936 | 3881 s->dsp.diff_pixels(temp, src1, src2, stride); |
2967 | 3882 |
936 | 3883 memcpy(bak, temp, 64*sizeof(DCTELEM)); |
2967 | 3884 |
1013 | 3885 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i); |
1689 | 3886 s->dct_unquantize_inter(s, temp, 0, s->qscale); |
6001 | 3887 ff_simple_idct(temp); //FIXME |
2967 | 3888 |
936 | 3889 for(i=0; i<64; i++) |
3890 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]); | |
2967 | 3891 |
936 | 3892 return sum; |
3893 } | |
3894 | |
1708 | 3895 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){ |
1007 | 3896 MpegEncContext * const s= (MpegEncContext *)c; |
1064 | 3897 const uint8_t *scantable= s->intra_scantable.permutated; |
10961
34a65026fa06
Move array specifiers outside DECLARE_ALIGNED() invocations
mru
parents:
10941
diff
changeset
|
3898 DECLARE_ALIGNED_16(uint64_t, aligned_temp)[sizeof(DCTELEM)*64/8]; |
34a65026fa06
Move array specifiers outside DECLARE_ALIGNED() invocations
mru
parents:
10941
diff
changeset
|
3899 DECLARE_ALIGNED_16(uint64_t, aligned_src1)[8]; |
34a65026fa06
Move array specifiers outside DECLARE_ALIGNED() invocations
mru
parents:
10941
diff
changeset
|
3900 DECLARE_ALIGNED_16(uint64_t, aligned_src2)[8]; |
1016 | 3901 DCTELEM * const temp= (DCTELEM*)aligned_temp; |
10068 | 3902 uint8_t * const lsrc1 = (uint8_t*)aligned_src1; |
3903 uint8_t * const lsrc2 = (uint8_t*)aligned_src2; | |
6719 | 3904 int i, last, run, bits, level, distortion, start_i; |
1007 | 3905 const int esc_length= s->ac_esc_length; |
3906 uint8_t * length; | |
3907 uint8_t * last_length; | |
2967 | 3908 |
1708 | 3909 assert(h==8); |
3910 | |
10068 | 3911 copy_block8(lsrc1, src1, 8, stride, 8); |
3912 copy_block8(lsrc2, src2, 8, stride, 8); | |
3913 | |
3914 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8); | |
1007 | 3915 |
1013 | 3916 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i); |
3917 | |
3918 bits=0; | |
2967 | 3919 |
1013 | 3920 if (s->mb_intra) { |
2967 | 3921 start_i = 1; |
1013 | 3922 length = s->intra_ac_vlc_length; |
3923 last_length= s->intra_ac_vlc_last_length; | |
3924 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma | |
3925 } else { | |
3926 start_i = 0; | |
3927 length = s->inter_ac_vlc_length; | |
3928 last_length= s->inter_ac_vlc_last_length; | |
3929 } | |
2967 | 3930 |
1013 | 3931 if(last>=start_i){ |
1007 | 3932 run=0; |
3933 for(i=start_i; i<last; i++){ | |
3934 int j= scantable[i]; | |
3935 level= temp[j]; | |
2967 | 3936 |
1007 | 3937 if(level){ |
3938 level+=64; | |
3939 if((level&(~127)) == 0){ | |
3940 bits+= length[UNI_AC_ENC_INDEX(run, level)]; | |
3941 }else | |
3942 bits+= esc_length; | |
3943 run=0; | |
3944 }else | |
3945 run++; | |
3946 } | |
3947 i= scantable[last]; | |
2967 | 3948 |
1011 | 3949 level= temp[i] + 64; |
3950 | |
3951 assert(level - 64); | |
2967 | 3952 |
1007 | 3953 if((level&(~127)) == 0){ |
3954 bits+= last_length[UNI_AC_ENC_INDEX(run, level)]; | |
3955 }else | |
3956 bits+= esc_length; | |
2967 | 3957 |
1013 | 3958 } |
3959 | |
3960 if(last>=0){ | |
1689 | 3961 if(s->mb_intra) |
3962 s->dct_unquantize_intra(s, temp, 0, s->qscale); | |
3963 else | |
3964 s->dct_unquantize_inter(s, temp, 0, s->qscale); | |
1007 | 3965 } |
2967 | 3966 |
10068 | 3967 s->dsp.idct_add(lsrc2, 8, temp); |
3968 | |
3969 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8); | |
6719 | 3970 |
3971 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7); | |
1007 | 3972 } |
3973 | |
1708 | 3974 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){ |
1007 | 3975 MpegEncContext * const s= (MpegEncContext *)c; |
1064 | 3976 const uint8_t *scantable= s->intra_scantable.permutated; |
10961
34a65026fa06
Move array specifiers outside DECLARE_ALIGNED() invocations
mru
parents:
10941
diff
changeset
|
3977 DECLARE_ALIGNED_16(uint64_t, aligned_temp)[sizeof(DCTELEM)*64/8]; |
1016 | 3978 DCTELEM * const temp= (DCTELEM*)aligned_temp; |
1007 | 3979 int i, last, run, bits, level, start_i; |
3980 const int esc_length= s->ac_esc_length; | |
3981 uint8_t * length; | |
3982 uint8_t * last_length; | |
1708 | 3983 |
3984 assert(h==8); | |
2967 | 3985 |
1013 | 3986 s->dsp.diff_pixels(temp, src1, src2, stride); |
1007 | 3987 |
1013 | 3988 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i); |
3989 | |
3990 bits=0; | |
2967 | 3991 |
1007 | 3992 if (s->mb_intra) { |
2967 | 3993 start_i = 1; |
1007 | 3994 length = s->intra_ac_vlc_length; |
3995 last_length= s->intra_ac_vlc_last_length; | |
1013 | 3996 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma |
1007 | 3997 } else { |
3998 start_i = 0; | |
3999 length = s->inter_ac_vlc_length; | |
4000 last_length= s->inter_ac_vlc_last_length; | |
4001 } | |
2967 | 4002 |
1013 | 4003 if(last>=start_i){ |
1007 | 4004 run=0; |
4005 for(i=start_i; i<last; i++){ | |
4006 int j= scantable[i]; | |
4007 level= temp[j]; | |
2967 | 4008 |
1007 | 4009 if(level){ |
4010 level+=64; | |
4011 if((level&(~127)) == 0){ | |
4012 bits+= length[UNI_AC_ENC_INDEX(run, level)]; | |
4013 }else | |
4014 bits+= esc_length; | |
4015 run=0; | |
4016 }else | |
4017 run++; | |
4018 } | |
4019 i= scantable[last]; | |
2967 | 4020 |
1013 | 4021 level= temp[i] + 64; |
2967 | 4022 |
1013 | 4023 assert(level - 64); |
2967 | 4024 |
1007 | 4025 if((level&(~127)) == 0){ |
4026 bits+= last_length[UNI_AC_ENC_INDEX(run, level)]; | |
4027 }else | |
4028 bits+= esc_length; | |
4029 } | |
4030 | |
4031 return bits; | |
4032 } | |
4033 | |
/**
 * Generate vertical-SAD intra comparison functions: the score is the sum
 * of |p(x,y) - p(x,y+1)| over all vertically adjacent pixel pairs in a
 * size-wide strip of height h.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score = 0; \
    int x, y; \
 \
    for (y = 1; y < h; y++) { \
        for (x = 0; x < size; x++) \
            score += FFABS(s[x] - s[x + stride]); \
        s += stride; \
    } \
 \
    return score; \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
1729 | 4051 |
/**
 * Vertical SAD between two 16-wide blocks: sums the absolute value of the
 * vertical second difference of (s1 - s2).
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            const int d = (s1[x] - s2[x]) - (s1[x + stride] - s2[x + stride]);
            score += FFABS(d);
        }
        s1 += stride;
        s2 += stride;
    }

    return score;
}
4066 | |
#define SQ(a) ((a)*(a))

/**
 * Generate vertical-SSE intra comparison functions: the score is the sum
 * of squared differences between vertically adjacent pixels in a
 * size-wide strip of height h.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score = 0; \
    int x, y; \
 \
    for (y = 1; y < h; y++) { \
        for (x = 0; x < size; x++) \
            score += SQ(s[x] - s[x + stride]); \
        s += stride; \
    } \
 \
    return score; \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
1729 | 4085 |
/**
 * Vertical SSE between two 16-wide blocks: sums the squared vertical
 * second difference of (s1 - s2).
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            const int d = (s1[x] - s2[x]) - (s1[x + stride] - s2[x + stride]);
            score += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return score;
}
4100 | |
/** Sum of squared differences between an int8 vector and an int16 vector. */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int i, sum = 0;
    for (i = 0; i < size; i++) {
        const int d = pix1[i] - pix2[i];
        sum += d * d;
    }
    return sum;
}
4109 | |
6056
558c1fd0ee72
Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
diego
parents:
6054
diff
changeset
|
4110 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c) |
558c1fd0ee72
Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
diego
parents:
6054
diff
changeset
|
4111 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c) |
558c1fd0ee72
Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
diego
parents:
6054
diff
changeset
|
4112 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c) |
8590 | 4113 #if CONFIG_GPL |
6056
558c1fd0ee72
Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
diego
parents:
6054
diff
changeset
|
4114 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c) |
3013 | 4115 #endif |
6056
558c1fd0ee72
Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
diego
parents:
6054
diff
changeset
|
4116 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c) |
558c1fd0ee72
Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
diego
parents:
6054
diff
changeset
|
4117 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c) |
558c1fd0ee72
Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
diego
parents:
6054
diff
changeset
|
4118 WRAPPER8_16_SQ(rd8x8_c, rd16_c) |
558c1fd0ee72
Fix typo in macro name: WARPER8_16_SQ --> WRAPPER8_16_SQ.
diego
parents:
6054
diff
changeset
|
4119 WRAPPER8_16_SQ(bit8x8_c, bit16_c) |
936 | 4120 |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
/** Multiply dst by src element-wise, in place: dst[i] *= src[i]. */
static void vector_fmul_c(float *dst, const float *src, int len){
    const float *end = src + len;
    while (src < end)
        *dst++ *= *src++;
}
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4126 |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
/** dst[i] = src0[i] * src1[len-1-i]: multiply by the reversed vector. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src0[i] * src1[len - 1 - i];
}
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4133 |
10300
4d1b9ca628fc
Drop unused args from vector_fmul_add_add, simpify code, and rename
mru
parents:
10219
diff
changeset
|
/** Multiply-accumulate: dst[i] = src0[i]*src1[i] + src2[i]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for (i = 0; i < len; i++) {
        const float prod = src0[i] * src1[i];
        dst[i] = prod + src2[i];
    }
}
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4139 |
/**
 * Overlap-windowing of two half-buffers into dst (2*len samples):
 *   dst[k]          = src0[k]*win[2len-1-k] - src1[len-1-k]*win[k] + add_bias
 *   dst[2len-1-k]   = src0[k]*win[k]       + src1[len-1-k]*win[2len-1-k] + add_bias
 * for k = 0..len-1.  Equivalent to the usual negative-index formulation,
 * written here with forward indices only.
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int k;
    for (k = 0; k < len; k++) {
        const int   m  = 2*len - 1 - k;   /* mirrored index */
        const float a  = src0[k];
        const float b  = src1[len - 1 - k];
        const float wk = win[k];
        const float wm = win[m];
        dst[k] = a*wm - b*wk + add_bias;
        dst[m] = a*wk + b*wm + add_bias;
    }
}
4154 | |
/** Scale a float vector: dst[i] = src[i] * mul. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    const float *end = src + len;
    while (src < end)
        *dst++ = *src++ * mul;
}
4162 | |
/**
 * Multiply src by a scalar and by 2-element sub-vectors: pair i uses
 * sv[i/2] as its per-sample weights.
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 2) {
        const float *v = sv[i >> 1];
        dst[i    ] = src[i    ] * v[0] * mul;
        dst[i + 1] = src[i + 1] * v[1] * mul;
    }
}
4172 | |
/**
 * Multiply src by a scalar and by 4-element sub-vectors: quad i uses
 * sv[i/4] as its per-sample weights.
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 4) {
        const float *v = sv[i >> 2];
        dst[i    ] = src[i    ] * v[0] * mul;
        dst[i + 1] = src[i + 1] * v[1] * mul;
        dst[i + 2] = src[i + 2] * v[2] * mul;
        dst[i + 3] = src[i + 3] * v[3] * mul;
    }
}
4184 | |
/** Expand 2-element sub-vectors scaled by mul: dst pair i = sv[i/2]*mul. */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 2) {
        const float *v = sv[i >> 1];
        dst[i    ] = v[0] * mul;
        dst[i + 1] = v[1] * mul;
    }
}
4194 | |
/** Expand 4-element sub-vectors scaled by mul: dst quad i = sv[i/4]*mul. */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 4) {
        const float *v = sv[i >> 2];
        dst[i    ] = v[0] * mul;
        dst[i + 1] = v[1] * mul;
        dst[i + 2] = v[2] * mul;
        dst[i + 3] = v[3] * mul;
    }
}
4206 | |
/** In-place butterfly: v1[i] becomes the sum, v2[i] the difference. */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        const float sum  = v1[i] + v2[i];
        const float diff = v1[i] - v2[i];
        v1[i] = sum;
        v2[i] = diff;
    }
}
4217 | |
/** Dot product of two float vectors of length len (forward accumulation). */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;

    while (len-- > 0)
        acc += *v1++ * *v2++;

    return acc;
}
4228 | |
/** Convert int samples to float with scaling: dst[i] = src[i] * mul. */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    const int *end = src + len;
    while (src < end)
        *dst++ = *src++ * mul;
}
4234 | |
10104
0fa3d21b317e
SSE optimized vector_clipf(). 10% faster TwinVQ decoding.
vitor
parents:
10094
diff
changeset
|
/**
 * Clip a single float, handled as its raw IEEE-754 bit pattern.
 * @param a        bit pattern of the value to clip
 * @param mini     bit pattern of the (negative) lower bound
 * @param maxi     bit pattern of the (positive) upper bound
 * @param maxisign maxi with the sign bit flipped, precomputed by the caller
 * @return bit pattern of the clipped value
 *
 * Only valid when min < 0 < max: the caller (vector_clipf_c) checks that
 * before taking this path.  Negative floats have the sign bit set, so as
 * unsigned integers they compare greater than every positive pattern;
 * `a > mini` therefore catches all values below the lower bound, and the
 * sign-flipped comparison catches values above the upper bound.
 *
 * Fix: the shift is written 1U<<31 — the previous 1<<31 left-shifted into
 * the sign bit of a signed int, which is undefined behavior in C.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}
0fa3d21b317e
SSE optimized vector_clipf(). 10% faster TwinVQ decoding.
vitor
parents:
10094
diff
changeset
|
4243 |
/**
 * Clip floats to [*min, *max] by comparing raw IEEE-754 bit patterns;
 * only valid when *min < 0 < *max (vector_clipf_c checks this before
 * dispatching here).  Elements are processed in groups of 8, matching the
 * original unrolling — len is assumed to be a multiple of 8, as with the
 * previous code.
 *
 * Fix: 1U<<31 instead of 1<<31 (signed left shift into the sign bit is
 * undefined behavior in C).
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i, j;
    const uint32_t mini     = *(uint32_t*)min;
    const uint32_t maxi     = *(uint32_t*)max;
    const uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t       *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;

    for (i = 0; i < len; i += 8)
        for (j = 0; j < 8; j++)
            dsti[i + j] = clipf_c_one(srci[i + j], mini, maxi, maxisign);
}
/**
 * Clip each float in src to [min, max] and store the result in dst.
 * When the range straddles zero the fast bit-pattern path is used;
 * otherwise elements are clipped in groups of 8 with av_clipf().
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i, j;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }
    for (i = 0; i < len; i += 8)
        for (j = 0; j < 8; j++)
            dst[i + j] = av_clipf(src[i + j], min, max);
}
0fa3d21b317e
SSE optimized vector_clipf(). 10% faster TwinVQ decoding.
vitor
parents:
10094
diff
changeset
|
4279 |
7261 | 4280 static av_always_inline int float_to_int16_one(const float *src){ |
4281 int_fast32_t tmp = *(const int32_t*)src; | |
4282 if(tmp & 0xf0000){ | |
4283 tmp = (0x43c0ffff - tmp)>>31; | |
4284 // is this faster on some gcc/cpu combinations? | |
4285 // if(tmp > 0x43c0ffff) tmp = 0xFFFF; | |
4286 // else tmp = 0; | |
4287 } | |
4288 return tmp - 0x8000; | |
4289 } | |
4290 | |
/** Convert len float samples to int16 (see float_to_int16_one). */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int i = 0;
    while (i < len) {
        dst[i] = float_to_int16_one(&src[i]);
        i++;
    }
}
4296 | |
7286
e267f2519248
float_to_int16_interleave: change src to an array of pointers instead of assuming it's contiguous.
lorenm
parents:
7263
diff
changeset
|
/**
 * Convert per-channel float buffers to interleaved int16 samples.
 * src is an array of `channels` pointers, each to len samples.
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int i, ch;
    if (channels == 2) {
        /* common stereo case: one pass, two writes per sample */
        for (i = 0; i < len; i++) {
            dst[2*i    ] = float_to_int16_one(&src[0][i]);
            dst[2*i + 1] = float_to_int16_one(&src[1][i]);
        }
    } else {
        for (ch = 0; ch < channels; ch++) {
            int16_t *out = dst + ch;
            for (i = 0; i < len; i++, out += channels)
                *out = float_to_int16_one(&src[ch][i]);
        }
    }
}
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4310 |
7203
87b1dfb5a98d
Add several vector functions used by Monkey's Audio decoder to dsputil
kostya
parents:
6719
diff
changeset
|
/**
 * Dot product of two int16 vectors, each product right-shifted by `shift`
 * before accumulation.
 */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int32_t sum = 0;
    int i;

    for (i = 0; i < order; i++)
        sum += (v1[i] * v2[i]) >> shift;

    return sum;
}
87b1dfb5a98d
Add several vector functions used by Monkey's Audio decoder to dsputil
kostya
parents:
6719
diff
changeset
|
4320 |
/**
 * Fused op: returns the dot product of v1 and v2 while, per element,
 * updating v1[i] += mul * v3[i] (the product uses the pre-update v1[i]).
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
{
    int32_t dot = 0;
    int i;
    for (i = 0; i < order; i++) {
        dot   += v1[i] * v2[i];
        v1[i] += mul * v3[i];
    }
    return dot;
}
4330 | |
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7  565 /* 2048*sqrt (2)*cos (7*pi/16) */

/** One horizontal 8-point IDCT pass over a row, rounding back to 8 fractional bits. */
static void wmv2_idct_row(short * b)
{
    int s1, s2;
    int t0, t1, t2, t3, t4, t5, t6, t7;
    /* stage 1: weighted butterflies */
    t0 = W0*b[0] + W0*b[4];
    t4 = W0*b[0] - W0*b[4];
    t1 = W1*b[1] + W7*b[7];
    t7 = W7*b[1] - W1*b[7];
    t2 = W2*b[2] + W6*b[6];
    t6 = W6*b[2] - W2*b[6];
    t5 = W5*b[5] + W3*b[3];
    t3 = W3*b[5] - W5*b[3];
    /* stage 2: rotate the odd terms (181/256 ~= 1/sqrt(2)) */
    s1 = (181*(t1 - t5 + t7 - t3) + 128) >> 8;
    s2 = (181*(t1 - t5 - t7 + t3) + 128) >> 8;
    /* stage 3: recombine with rounding */
    b[0] = (t0 + t2 + t1 + t5 + (1<<7)) >> 8;
    b[1] = (t4 + t6 + s1      + (1<<7)) >> 8;
    b[2] = (t4 - t6 + s2      + (1<<7)) >> 8;
    b[3] = (t0 - t2 + t7 + t3 + (1<<7)) >> 8;
    b[4] = (t0 - t2 - t7 - t3 + (1<<7)) >> 8;
    b[5] = (t4 - t6 - s2      + (1<<7)) >> 8;
    b[6] = (t4 + t6 - s1      + (1<<7)) >> 8;
    b[7] = (t0 + t2 - t1 - t5 + (1<<7)) >> 8;
}

/** One vertical 8-point IDCT pass over a column (stride 8), with extended precision. */
static void wmv2_idct_col(short * b)
{
    int s1, s2;
    int t0, t1, t2, t3, t4, t5, t6, t7;
    /* stage 1, keeping 3 extra fractional bits */
    t0 = (W0*b[8*0] + W0*b[8*4]    ) >> 3;
    t4 = (W0*b[8*0] - W0*b[8*4]    ) >> 3;
    t1 = (W1*b[8*1] + W7*b[8*7] + 4) >> 3;
    t7 = (W7*b[8*1] - W1*b[8*7] + 4) >> 3;
    t2 = (W2*b[8*2] + W6*b[8*6] + 4) >> 3;
    t6 = (W6*b[8*2] - W2*b[8*6] + 4) >> 3;
    t5 = (W5*b[8*5] + W3*b[8*3] + 4) >> 3;
    t3 = (W3*b[8*5] - W5*b[8*3] + 4) >> 3;
    /* stage 2 */
    s1 = (181*(t1 - t5 + t7 - t3) + 128) >> 8;
    s2 = (181*(t1 - t5 - t7 + t3) + 128) >> 8;
    /* stage 3: final rounding back to pixel scale */
    b[8*0] = (t0 + t2 + t1 + t5 + (1<<13)) >> 14;
    b[8*1] = (t4 + t6 + s1      + (1<<13)) >> 14;
    b[8*2] = (t4 - t6 + s2      + (1<<13)) >> 14;
    b[8*3] = (t0 - t2 + t7 + t3 + (1<<13)) >> 14;
    b[8*4] = (t0 - t2 - t7 - t3 + (1<<13)) >> 14;
    b[8*5] = (t4 - t6 - s2      + (1<<13)) >> 14;
    b[8*6] = (t4 + t6 - s1      + (1<<13)) >> 14;
    b[8*7] = (t0 + t2 - t1 - t5 + (1<<13)) >> 14;
}

/** Full 8x8 WMV2 inverse DCT, in place: 8 row passes then 8 column passes. */
void ff_wmv2_idct_c(short * block){
    int i;

    for (i = 0; i < 64; i += 8)
        wmv2_idct_row(block + i);
    for (i = 0; i < 8; i++)
        wmv2_idct_col(block + i);
}
1092 | 4403 /* XXX: those functions should be suppressed ASAP when all IDCTs are |
4404 converted */ | |
5887 | 4405 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block) |
4406 { | |
4407 ff_wmv2_idct_c(block); | |
4408 put_pixels_clamped_c(block, dest, line_size); | |
4409 } | |
4410 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block) | |
4411 { | |
4412 ff_wmv2_idct_c(block); | |
4413 add_pixels_clamped_c(block, dest, line_size); | |
4414 } | |
1092 | 4415 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block) |
4416 { | |
4417 j_rev_dct (block); | |
4418 put_pixels_clamped_c(block, dest, line_size); | |
4419 } | |
4420 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block) | |
4421 { | |
4422 j_rev_dct (block); | |
4423 add_pixels_clamped_c(block, dest, line_size); | |
4424 } | |
4425 | |
2256 | 4426 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block) |
4427 { | |
4428 j_rev_dct4 (block); | |
4429 put_pixels_clamped4_c(block, dest, line_size); | |
4430 } | |
4431 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block) | |
4432 { | |
4433 j_rev_dct4 (block); | |
4434 add_pixels_clamped4_c(block, dest, line_size); | |
4435 } | |
4436 | |
2257 | 4437 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block) |
4438 { | |
4439 j_rev_dct2 (block); | |
4440 put_pixels_clamped2_c(block, dest, line_size); | |
4441 } | |
4442 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block) | |
4443 { | |
4444 j_rev_dct2 (block); | |
4445 add_pixels_clamped2_c(block, dest, line_size); | |
4446 } | |
4447 | |
2259 | 4448 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block) |
4449 { | |
4176 | 4450 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2259 | 4451 |
4452 dest[0] = cm[(block[0] + 4)>>3]; | |
4453 } | |
4454 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block) | |
4455 { | |
4176 | 4456 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; |
2259 | 4457 |
4458 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)]; | |
4459 } | |
4460 | |
5143 | 4461 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; } |
3215
06f98047ff26
prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents:
3199
diff
changeset
|
4462 |
1201 | 4463 /* init static data */ |
10867 | 4464 av_cold void dsputil_static_init(void) |
0 | 4465 { |
751 | 4466 int i; |
0 | 4467 |
4176 | 4468 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i; |
1201 | 4469 for(i=0;i<MAX_NEG_CROP;i++) { |
4176 | 4470 ff_cropTbl[i] = 0; |
4471 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255; | |
1201 | 4472 } |
2967 | 4473 |
1201 | 4474 for(i=0;i<512;i++) { |
4179 | 4475 ff_squareTbl[i] = (i - 256) * (i - 256); |
1201 | 4476 } |
2967 | 4477 |
4197 | 4478 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1; |
1201 | 4479 } |
0 | 4480 |
4281
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4481 int ff_check_alignment(void){ |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4482 static int did_fail=0; |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4483 DECLARE_ALIGNED_16(int, aligned); |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4484 |
9259 | 4485 if((intptr_t)&aligned & 15){ |
4281
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4486 if(!did_fail){ |
8590 | 4487 #if HAVE_MMX || HAVE_ALTIVEC |
4281
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4488 av_log(NULL, AV_LOG_ERROR, |
4292 | 4489 "Compiler did not align stack variables. Libavcodec has been miscompiled\n" |
4490 "and may be very slow or crash. This is not a bug in libavcodec,\n" | |
5542
b0a566346fb1
Add attribute that forces alignment of stack to functions that need it.
ramiro
parents:
5520
diff
changeset
|
4491 "but in the compiler. You may try recompiling using gcc >= 4.2.\n" |
b0a566346fb1
Add attribute that forces alignment of stack to functions that need it.
ramiro
parents:
5520
diff
changeset
|
4492 "Do not report crashes to FFmpeg developers.\n"); |
4281
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4493 #endif |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4494 did_fail=1; |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4495 } |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4496 return -1; |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4497 } |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4498 return 0; |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4499 } |
861 | 4500 |
10867 | 4501 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) |
1201 | 4502 { |
4503 int i; | |
0 | 4504 |
4281
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4505 ff_check_alignment(); |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4240
diff
changeset
|
4506 |
8590 | 4507 #if CONFIG_ENCODERS |
1567 | 4508 if(avctx->dct_algo==FF_DCT_FASTINT) { |
1092 | 4509 c->fdct = fdct_ifast; |
2979 | 4510 c->fdct248 = fdct_ifast248; |
2967 | 4511 } |
1567 | 4512 else if(avctx->dct_algo==FF_DCT_FAAN) { |
1557 | 4513 c->fdct = ff_faandct; |
2979 | 4514 c->fdct248 = ff_faandct248; |
2967 | 4515 } |
1567 | 4516 else { |
1092 | 4517 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default |
2979 | 4518 c->fdct248 = ff_fdct248_islow; |
1567 | 4519 } |
1092 | 4520 #endif //CONFIG_ENCODERS |
4521 | |
2256 | 4522 if(avctx->lowres==1){ |
8596
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4523 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){ |
2272
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4524 c->idct_put= ff_jref_idct4_put; |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4525 c->idct_add= ff_jref_idct4_add; |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4526 }else{ |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4527 c->idct_put= ff_h264_lowres_idct_put_c; |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4528 c->idct_add= ff_h264_lowres_idct_add_c; |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4529 } |
2256 | 4530 c->idct = j_rev_dct4; |
1092 | 4531 c->idct_permutation_type= FF_NO_IDCT_PERM; |
2257 | 4532 }else if(avctx->lowres==2){ |
4533 c->idct_put= ff_jref_idct2_put; | |
4534 c->idct_add= ff_jref_idct2_add; | |
4535 c->idct = j_rev_dct2; | |
4536 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
2259 | 4537 }else if(avctx->lowres==3){ |
4538 c->idct_put= ff_jref_idct1_put; | |
4539 c->idct_add= ff_jref_idct1_add; | |
4540 c->idct = j_rev_dct1; | |
4541 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
2256 | 4542 }else{ |
4543 if(avctx->idct_algo==FF_IDCT_INT){ | |
4544 c->idct_put= ff_jref_idct_put; | |
4545 c->idct_add= ff_jref_idct_add; | |
4546 c->idct = j_rev_dct; | |
4547 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; | |
9975
d6d7e8d4a04d
Do not redundantly check for both CONFIG_THEORA_DECODER and CONFIG_VP3_DECODER.
diego
parents:
9586
diff
changeset
|
4548 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) && |
5007 | 4549 avctx->idct_algo==FF_IDCT_VP3){ |
2693 | 4550 c->idct_put= ff_vp3_idct_put_c; |
4551 c->idct_add= ff_vp3_idct_add_c; | |
4552 c->idct = ff_vp3_idct_c; | |
4553 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
5887 | 4554 }else if(avctx->idct_algo==FF_IDCT_WMV2){ |
4555 c->idct_put= ff_wmv2_idct_put_c; | |
4556 c->idct_add= ff_wmv2_idct_add_c; | |
4557 c->idct = ff_wmv2_idct_c; | |
4558 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
6407 | 4559 }else if(avctx->idct_algo==FF_IDCT_FAAN){ |
4560 c->idct_put= ff_faanidct_put; | |
4561 c->idct_add= ff_faanidct_add; | |
4562 c->idct = ff_faanidct; | |
4563 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
8596
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4564 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) { |
8120 | 4565 c->idct_put= ff_ea_idct_put_c; |
4566 c->idct_permutation_type= FF_NO_IDCT_PERM; | |
2256 | 4567 }else{ //accurate/default |
6001 | 4568 c->idct_put= ff_simple_idct_put; |
4569 c->idct_add= ff_simple_idct_add; | |
4570 c->idct = ff_simple_idct; | |
2256 | 4571 c->idct_permutation_type= FF_NO_IDCT_PERM; |
4572 } | |
1092 | 4573 } |
4574 | |
8596
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4575 if (CONFIG_H264_DECODER) { |
5065 | 4576 c->h264_idct_add= ff_h264_idct_add_c; |
4577 c->h264_idct8_add= ff_h264_idct8_add_c; | |
4578 c->h264_idct_dc_add= ff_h264_idct_dc_add_c; | |
4579 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c; | |
8375
de2509cf3c44
H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents:
8359
diff
changeset
|
4580 c->h264_idct_add16 = ff_h264_idct_add16_c; |
de2509cf3c44
H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents:
8359
diff
changeset
|
4581 c->h264_idct8_add4 = ff_h264_idct8_add4_c; |
de2509cf3c44
H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents:
8359
diff
changeset
|
4582 c->h264_idct_add8 = ff_h264_idct_add8_c; |
de2509cf3c44
H.264 idct functions that include the chroma, inter luma and intra16 luma loops
michael
parents:
8359
diff
changeset
|
4583 c->h264_idct_add16intra= ff_h264_idct_add16intra_c; |
5064 | 4584 } |
2272
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
4585 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4586 c->get_pixels = get_pixels_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4587 c->diff_pixels = diff_pixels_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4588 c->put_pixels_clamped = put_pixels_clamped_c; |
1984
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
4589 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c; |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4590 c->add_pixels_clamped = add_pixels_clamped_c; |
2763 | 4591 c->add_pixels8 = add_pixels8_c; |
4592 c->add_pixels4 = add_pixels4_c; | |
4988
689490842cf5
factor sum_abs_dctelem out of dct_sad, and simd it.
lorenm
parents:
4749
diff
changeset
|
4593 c->sum_abs_dctelem = sum_abs_dctelem_c; |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4594 c->gmc1 = gmc1_c; |
3248
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3245
diff
changeset
|
4595 c->gmc = ff_gmc_c; |
8288 | 4596 c->clear_block = clear_block_c; |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4597 c->clear_blocks = clear_blocks_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4598 c->pix_sum = pix_sum_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4599 c->pix_norm1 = pix_norm1_c; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4600 |
859 | 4601 /* TODO [0] 16 [1] 8 */ |
1708 | 4602 c->pix_abs[0][0] = pix_abs16_c; |
4603 c->pix_abs[0][1] = pix_abs16_x2_c; | |
4604 c->pix_abs[0][2] = pix_abs16_y2_c; | |
4605 c->pix_abs[0][3] = pix_abs16_xy2_c; | |
4606 c->pix_abs[1][0] = pix_abs8_c; | |
4607 c->pix_abs[1][1] = pix_abs8_x2_c; | |
4608 c->pix_abs[1][2] = pix_abs8_y2_c; | |
4609 c->pix_abs[1][3] = pix_abs8_xy2_c; | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4610 |
859 | 4611 #define dspfunc(PFX, IDX, NUM) \ |
4612 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \ | |
4613 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \ | |
4614 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \ | |
4615 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
764
diff
changeset
|
4616 |
859 | 4617 dspfunc(put, 0, 16); |
4618 dspfunc(put_no_rnd, 0, 16); | |
4619 dspfunc(put, 1, 8); | |
4620 dspfunc(put_no_rnd, 1, 8); | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4621 dspfunc(put, 2, 4); |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4622 dspfunc(put, 3, 2); |
0 | 4623 |
859 | 4624 dspfunc(avg, 0, 16); |
4625 dspfunc(avg_no_rnd, 0, 16); | |
4626 dspfunc(avg, 1, 8); | |
4627 dspfunc(avg_no_rnd, 1, 8); | |
1319 | 4628 dspfunc(avg, 2, 4); |
4629 dspfunc(avg, 3, 2); | |
859 | 4630 #undef dspfunc |
857 | 4631 |
1864 | 4632 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c; |
4633 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c; | |
4634 | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4635 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4636 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4637 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4638 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4639 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4640 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4641 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4642 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4643 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c; |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
4644 |
1319 | 4645 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c; |
4646 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c; | |
4647 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c; | |
4648 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c; | |
4649 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c; | |
4650 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c; | |
4651 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c; | |
4652 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c; | |
4653 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c; | |
4654 | |
859 | 4655 #define dspfunc(PFX, IDX, NUM) \ |
4656 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \ | |
4657 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \ | |
4658 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \ | |
4659 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \ | |
4660 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \ | |
4661 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \ | |
4662 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \ | |
4663 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \ | |
4664 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \ | |
4665 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \ | |
4666 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \ | |
4667 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \ | |
4668 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \ | |
4669 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \ | |
4670 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \ | |
4671 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c | |
857 | 4672 |
859 | 4673 dspfunc(put_qpel, 0, 16); |
4674 dspfunc(put_no_rnd_qpel, 0, 16); | |
4675 | |
4676 dspfunc(avg_qpel, 0, 16); | |
4677 /* dspfunc(avg_no_rnd_qpel, 0, 16); */ | |
857 | 4678 |
859 | 4679 dspfunc(put_qpel, 1, 8); |
4680 dspfunc(put_no_rnd_qpel, 1, 8); | |
4681 | |
4682 dspfunc(avg_qpel, 1, 8); | |
4683 /* dspfunc(avg_no_rnd_qpel, 1, 8); */ | |
1168 | 4684 |
4685 dspfunc(put_h264_qpel, 0, 16); | |
4686 dspfunc(put_h264_qpel, 1, 8); | |
4687 dspfunc(put_h264_qpel, 2, 4); | |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3013
diff
changeset
|
4688 dspfunc(put_h264_qpel, 3, 2); |
1168 | 4689 dspfunc(avg_h264_qpel, 0, 16); |
4690 dspfunc(avg_h264_qpel, 1, 8); | |
4691 dspfunc(avg_h264_qpel, 2, 4); | |
4692 | |
859 | 4693 #undef dspfunc |
1168 | 4694 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c; |
4695 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c; | |
4696 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c; | |
4697 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c; | |
4698 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c; | |
4699 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c; | |
9439
ef3a7b711cc0
Rename put_no_rnd_h264_chroma* to reflect its usage in VC1 only
conrad
parents:
9437
diff
changeset
|
4700 c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c; |
9440 | 4701 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c; |
857 | 4702 |
2415 | 4703 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c; |
4704 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c; | |
4705 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c; | |
4706 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c; | |
4707 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c; | |
4708 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c; | |
4709 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c; | |
4710 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c; | |
4711 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c; | |
4712 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c; | |
4713 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c; | |
4714 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c; | |
4715 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c; | |
4716 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c; | |
4717 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c; | |
4718 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c; | |
4719 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c; | |
4720 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c; | |
4721 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c; | |
4722 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c; | |
4723 | |
6437 | 4724 c->draw_edges = draw_edges_c; |
4725 | |
8590 | 4726 #if CONFIG_CAVS_DECODER |
3395
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
4727 ff_cavsdsp_init(c,avctx); |
3432 | 4728 #endif |
9585 | 4729 |
4730 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER | |
4731 ff_mlp_init(c, avctx); | |
4732 #endif | |
9995
3141f69e3905
Do not check for both CONFIG_VC1_DECODER and CONFIG_WMV3_DECODER,
diego
parents:
9975
diff
changeset
|
4733 #if CONFIG_VC1_DECODER |
3526 | 4734 ff_vc1dsp_init(c,avctx); |
4735 #endif | |
9995
3141f69e3905
Do not check for both CONFIG_VC1_DECODER and CONFIG_WMV3_DECODER,
diego
parents:
9975
diff
changeset
|
4736 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER |
5887 | 4737 ff_intrax8dsp_init(c,avctx); |
4738 #endif | |
8590 | 4739 #if CONFIG_RV30_DECODER |
8410 | 4740 ff_rv30dsp_init(c,avctx); |
4741 #endif | |
8590 | 4742 #if CONFIG_RV40_DECODER |
8232 | 4743 ff_rv40dsp_init(c,avctx); |
4744 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c; | |
4745 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c; | |
4746 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c; | |
4747 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c; | |
4748 #endif | |
3395
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3373
diff
changeset
|
4749 |
936 | 4750 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c; |
4751 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c; | |
4752 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c; | |
4753 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c; | |
4754 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c; | |
4755 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c; | |
4756 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c; | |
4757 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c; | |
2967 | 4758 |
1708 | 4759 #define SET_CMP_FUNC(name) \ |
4760 c->name[0]= name ## 16_c;\ | |
4761 c->name[1]= name ## 8x8_c; | |
2967 | 4762 |
1708 | 4763 SET_CMP_FUNC(hadamard8_diff) |
1729 | 4764 c->hadamard8_diff[4]= hadamard8_intra16_c; |
8978 | 4765 c->hadamard8_diff[5]= hadamard8_intra8x8_c; |
1708 | 4766 SET_CMP_FUNC(dct_sad) |
2382 | 4767 SET_CMP_FUNC(dct_max) |
8590 | 4768 #if CONFIG_GPL |
3010
533c6386eca9
8x8 integer dct from x264 as cmp function (under CONFIG_GPL)
michael
parents:
2979
diff
changeset
|
4769 SET_CMP_FUNC(dct264_sad) |
3013 | 4770 #endif |
1708 | 4771 c->sad[0]= pix_abs16_c; |
4772 c->sad[1]= pix_abs8_c; | |
4773 c->sse[0]= sse16_c; | |
4774 c->sse[1]= sse8_c; | |
2184 | 4775 c->sse[2]= sse4_c; |
1708 | 4776 SET_CMP_FUNC(quant_psnr) |
4777 SET_CMP_FUNC(rd) | |
4778 SET_CMP_FUNC(bit) | |
1729 | 4779 c->vsad[0]= vsad16_c; |
4780 c->vsad[4]= vsad_intra16_c; | |
8978 | 4781 c->vsad[5]= vsad_intra8_c; |
1729 | 4782 c->vsse[0]= vsse16_c; |
4783 c->vsse[4]= vsse_intra16_c; | |
8978 | 4784 c->vsse[5]= vsse_intra8_c; |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
4785 c->nsse[0]= nsse16_c; |
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
4786 c->nsse[1]= nsse8_c; |
8590 | 4787 #if CONFIG_SNOW_ENCODER |
2184 | 4788 c->w53[0]= w53_16_c; |
4789 c->w53[1]= w53_8_c; | |
4790 c->w97[0]= w97_16_c; | |
4791 c->w97[1]= w97_8_c; | |
3373
b8996cc5ccae
Disable w53 and w97 cmp methods when snow encoder is disabled
gpoirier
parents:
3323
diff
changeset
|
4792 #endif |
2184 | 4793 |
4749 | 4794 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c; |
4795 | |
866 | 4796 c->add_bytes= add_bytes_c; |
6384 | 4797 c->add_bytes_l2= add_bytes_l2_c; |
866 | 4798 c->diff_bytes= diff_bytes_c; |
8760 | 4799 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c; |
1527 | 4800 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c; |
10370 | 4801 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c; |
4802 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c; | |
1273 | 4803 c->bswap_buf= bswap_buf; |
8590 | 4804 #if CONFIG_PNG_DECODER |
6384 | 4805 c->add_png_paeth_prediction= ff_add_png_paeth_prediction; |
4806 #endif | |
2633 | 4807 |
4808 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c; | |
4809 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c; | |
8395
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
4810 c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c; |
195cba8f6257
Move filter_luma_intra into dsputil for later addition of asm.
darkshikari
parents:
8375
diff
changeset
|
4811 c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c; |
2633 | 4812 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c; |
4813 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c; | |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
4814 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c; |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
4815 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c; |
3645
47821be55b6c
mmx implementation of deblocking strength decision.
lorenm
parents:
3568
diff
changeset
|
4816 c->h264_loop_filter_strength= NULL; |
2967 | 4817 |
10749
5cca4b6c459d
Get rid of pointless CONFIG_ANY_H263 preprocessor definition.
diego
parents:
10748
diff
changeset
|
4818 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { |
5278 | 4819 c->h263_h_loop_filter= h263_h_loop_filter_c; |
4820 c->h263_v_loop_filter= h263_v_loop_filter_c; | |
5277
7b3fcb7c61ce
Avoid linking with h263.c functions when the relevant codecs
aurel
parents:
5256
diff
changeset
|
4821 } |
2967 | 4822 |
9975
d6d7e8d4a04d
Do not redundantly check for both CONFIG_THEORA_DECODER and CONFIG_VP3_DECODER.
diego
parents:
9586
diff
changeset
|
4823 if (CONFIG_VP3_DECODER) { |
7995 | 4824 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c; |
4825 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c; | |
4826 } | |
8785
bee83b3f9a6b
move vp6_filter_diag4() to a new vp6dsp.c file and use it throught dsputil
aurel
parents:
8760
diff
changeset
|
4827 if (CONFIG_VP6_DECODER) { |
bee83b3f9a6b
move vp6_filter_diag4() to a new vp6dsp.c file and use it throught dsputil
aurel
parents:
8760
diff
changeset
|
4828 c->vp6_filter_diag4= ff_vp6_filter_diag4_c; |
bee83b3f9a6b
move vp6_filter_diag4() to a new vp6dsp.c file and use it throught dsputil
aurel
parents:
8760
diff
changeset
|
4829 } |
7995 | 4830 |
2045 | 4831 c->h261_loop_filter= h261_loop_filter_c; |
2967 | 4832 |
1784 | 4833 c->try_8x8basis= try_8x8basis_c; |
4834 c->add_8x8basis= add_8x8basis_c; | |
866 | 4835 |
8590 | 4836 #if CONFIG_SNOW_DECODER |
3198
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
4837 c->vertical_compose97i = ff_snow_vertical_compose97i; |
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
4838 c->horizontal_compose97i = ff_snow_horizontal_compose97i; |
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
4839 c->inner_add_yblock = ff_snow_inner_add_yblock; |
3199
1651e69b9f7a
10l: Only set *compose97i *add_yblock to dsputils context if we are building with Snow enabled
gpoirier
parents:
3198
diff
changeset
|
4840 #endif |
3198
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3105
diff
changeset
|
4841 |
8590 | 4842 #if CONFIG_VORBIS_DECODER |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
4843 c->vorbis_inverse_coupling = vorbis_inverse_coupling; |
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
4844 #endif |
8590 | 4845 #if CONFIG_AC3_DECODER |
7563 | 4846 c->ac3_downmix = ff_ac3_downmix_c; |
4847 #endif | |
10429
289dd8daf4ee
add CONFIG_LPC to the build system for lpc dsputil functions. fixes build
jbr
parents:
10424
diff
changeset
|
4848 #if CONFIG_LPC |
10424
94595d0e617c
Move autocorrelation function from flacenc.c to lpc.c. Also rename the
jbr
parents:
10421
diff
changeset
|
4849 c->lpc_compute_autocorr = ff_lpc_compute_autocorr; |
10429
289dd8daf4ee
add CONFIG_LPC to the build system for lpc dsputil functions. fixes build
jbr
parents:
10424
diff
changeset
|
4850 #endif |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4851 c->vector_fmul = vector_fmul_c; |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4852 c->vector_fmul_reverse = vector_fmul_reverse_c; |
10300
4d1b9ca628fc
Drop unused args from vector_fmul_add_add, simpify code, and rename
mru
parents:
10219
diff
changeset
|
4853 c->vector_fmul_add = vector_fmul_add_c; |
7261 | 4854 c->vector_fmul_window = ff_vector_fmul_window_c; |
7564 | 4855 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; |
10104
0fa3d21b317e
SSE optimized vector_clipf(). 10% faster TwinVQ decoding.
vitor
parents:
10094
diff
changeset
|
4856 c->vector_clipf = vector_clipf_c; |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3536
diff
changeset
|
4857 c->float_to_int16 = ff_float_to_int16_c; |
7261 | 4858 c->float_to_int16_interleave = ff_float_to_int16_interleave_c; |
7203
87b1dfb5a98d
Add several vector functions used by Monkey's Audio decoder to dsputil
kostya
parents:
6719
diff
changeset
|
4859 c->scalarproduct_int16 = scalarproduct_int16_c; |
10644 | 4860 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c; |
10219 | 4861 c->scalarproduct_float = scalarproduct_float_c; |
4862 c->butterflies_float = butterflies_float_c; | |
4863 c->vector_fmul_scalar = vector_fmul_scalar_c; | |
4864 | |
4865 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c; | |
4866 c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c; | |
4867 | |
4868 c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c; | |
4869 c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c; | |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
4870 |
3245 | 4871 c->shrink[0]= ff_img_copy_plane; |
4872 c->shrink[1]= ff_shrink22; | |
4873 c->shrink[2]= ff_shrink44; | |
4874 c->shrink[3]= ff_shrink88; | |
4875 | |
3215
06f98047ff26
prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents:
3199
diff
changeset
|
4876 c->prefetch= just_return; |
06f98047ff26
prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents:
3199
diff
changeset
|
4877 |
3807
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4878 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab)); |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4879 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab)); |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4880 |
8596
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4881 if (HAVE_MMX) dsputil_init_mmx (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4882 if (ARCH_ARM) dsputil_init_arm (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4883 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4884 if (HAVE_VIS) dsputil_init_vis (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4885 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4886 if (ARCH_PPC) dsputil_init_ppc (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4887 if (HAVE_MMI) dsputil_init_mmi (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4888 if (ARCH_SH4) dsputil_init_sh4 (c, avctx); |
68e959302527
replace all occurrence of ENABLE_ by the corresponding CONFIG_, HAVE_ or ARCH_
aurel
parents:
8590
diff
changeset
|
4889 if (ARCH_BFIN) dsputil_init_bfin (c, avctx); |
1092 | 4890 |
3807
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4891 for(i=0; i<64; i++){ |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4892 if(!c->put_2tap_qpel_pixels_tab[0][i]) |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4893 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i]; |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4894 if(!c->avg_2tap_qpel_pixels_tab[0][i]) |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4895 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i]; |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4896 } |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3728
diff
changeset
|
4897 |
1092 | 4898 switch(c->idct_permutation_type){ |
4899 case FF_NO_IDCT_PERM: | |
4900 for(i=0; i<64; i++) | |
4901 c->idct_permutation[i]= i; | |
4902 break; | |
4903 case FF_LIBMPEG2_IDCT_PERM: | |
4904 for(i=0; i<64; i++) | |
4905 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2); | |
4906 break; | |
4907 case FF_SIMPLE_IDCT_PERM: | |
4908 for(i=0; i<64; i++) | |
4909 c->idct_permutation[i]= simple_mmx_permutation[i]; | |
4910 break; | |
4911 case FF_TRANSPOSE_IDCT_PERM: | |
4912 for(i=0; i<64; i++) | |
4913 c->idct_permutation[i]= ((i&7)<<3) | (i>>3); | |
4914 break; | |
2696
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2693
diff
changeset
|
4915 case FF_PARTTRANS_IDCT_PERM: |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2693
diff
changeset
|
4916 for(i=0; i<64; i++) |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2693
diff
changeset
|
4917 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3); |
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2693
diff
changeset
|
4918 break; |
6600
c3213c91124c
Add a new IDCT permutation, used in xvid_sse2 and possibly future similar IDCTs.
astrange
parents:
6450
diff
changeset
|
4919 case FF_SSE2_IDCT_PERM: |
c3213c91124c
Add a new IDCT permutation, used in xvid_sse2 and possibly future similar IDCTs.
astrange
parents:
6450
diff
changeset
|
4920 for(i=0; i<64; i++) |
c3213c91124c
Add a new IDCT permutation, used in xvid_sse2 and possibly future similar IDCTs.
astrange
parents:
6450
diff
changeset
|
4921 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7]; |
c3213c91124c
Add a new IDCT permutation, used in xvid_sse2 and possibly future similar IDCTs.
astrange
parents:
6450
diff
changeset
|
4922 break; |
1092 | 4923 default: |
1598
932d306bf1dc
av_log() patch by (Michel Bardiaux <mbardiaux at peaktime dot be>)
michael
parents:
1571
diff
changeset
|
4924 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n"); |
1092 | 4925 } |
0 | 4926 } |
252
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
220
diff
changeset
|
4927 |