Mercurial > libavcodec.hg
annotate dsputil.h @ 4843:0e11f292482f libavcodec
Replace hackish support for amr-nb and amr-wb. Instead of including the source
of the reference implementation it is possible to use proper libraries now.
patch by Stanislav Brabec, sbrabec suse cz, changes and bug fixes by me
author | diego |
---|---|
date | Thu, 12 Apr 2007 10:59:52 +0000 |
parents | 0d48b61e655d |
children | f99e40a7155b |
rev | line source |
---|---|
429 | 1 /* |
2 * DSP utils | |
3 * Copyright (c) 2000, 2001, 2002 Fabrice Bellard. | |
1739
07a484280a82
copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise
michael
parents:
1729
diff
changeset
|
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
429 | 5 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
6 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
7 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
8 * FFmpeg is free software; you can redistribute it and/or |
429 | 9 * modify it under the terms of the GNU Lesser General Public |
10 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
11 * version 2.1 of the License, or (at your option) any later version. |
429 | 12 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
13 * FFmpeg is distributed in the hope that it will be useful, |
429 | 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 * Lesser General Public License for more details. | |
17 * | |
18 * You should have received a copy of the GNU Lesser General Public | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3807
diff
changeset
|
19 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
3029
diff
changeset
|
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
429 | 21 */ |
1102 | 22 |
23 /** | |
24 * @file dsputil.h | |
1106 | 25 * DSP utils. |
1213 | 26 * note, many functions in here may use MMX which trashes the FPU state, it is |
27 * absolutely necessary to call emms_c() between dsp & float/double code | |
1102 | 28 */ |
29 | |
0 | 30 #ifndef DSPUTIL_H |
31 #define DSPUTIL_H | |
32 | |
33 #include "common.h" | |
252
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
214
diff
changeset
|
34 #include "avcodec.h" |
0 | 35 |
1102 | 36 |
255 | 37 //#define DEBUG |
0 | 38 /* dct code */ |
39 typedef short DCTELEM; | |
3198
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3175
diff
changeset
|
40 typedef int DWTELEM; |
0 | 41 |
474
11dbd00682fc
avoid name clash with libjpeg - added missing externs
bellard
parents:
429
diff
changeset
|
42 void fdct_ifast (DCTELEM *data); |
1571
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1567
diff
changeset
|
43 void fdct_ifast248 (DCTELEM *data); |
625
bb6a69f9d409
slow but accurate integer dct from IJG (should be ok with the LGPL as the old DCT is the fast integer DCT from IJG)
michaelni
parents:
623
diff
changeset
|
44 void ff_jpeg_fdct_islow (DCTELEM *data); |
1567 | 45 void ff_fdct248_islow (DCTELEM *data); |
0 | 46 |
47 void j_rev_dct (DCTELEM *data); | |
2256 | 48 void j_rev_dct4 (DCTELEM *data); |
2257 | 49 void j_rev_dct2 (DCTELEM *data); |
2259 | 50 void j_rev_dct1 (DCTELEM *data); |
0 | 51 |
687
9abb13c21fbe
fdct_mmx -> ff_fdct_mmx (renamed to avoid namespace conflict with xvid)
arpi_esp
parents:
675
diff
changeset
|
52 void ff_fdct_mmx(DCTELEM *block); |
1565 | 53 void ff_fdct_mmx2(DCTELEM *block); |
1765
e31754bc5b65
SSE2 fdct by (Balatoni Denes <pnis at coder dot hu>)
michael
parents:
1739
diff
changeset
|
54 void ff_fdct_sse2(DCTELEM *block); |
0 | 55 |
2755 | 56 void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride); |
2272
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
57 void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride); |
3105
2d35fb3cb940
h264: special case dc-only idct. ~1% faster overall
lorenm
parents:
3089
diff
changeset
|
58 void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride); |
2d35fb3cb940
h264: special case dc-only idct. ~1% faster overall
lorenm
parents:
3089
diff
changeset
|
59 void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride); |
2272
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
60 void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block); |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
61 void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block); |
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
62 |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3555
diff
changeset
|
63 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3555
diff
changeset
|
64 const float *src2, int src3, int blocksize, int step); |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3555
diff
changeset
|
65 void ff_float_to_int16_c(int16_t *dst, const float *src, int len); |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3555
diff
changeset
|
66 |
34 | 67 /* encoding scans */ |
1064 | 68 extern const uint8_t ff_alternate_horizontal_scan[64]; |
69 extern const uint8_t ff_alternate_vertical_scan[64]; | |
70 extern const uint8_t ff_zigzag_direct[64]; | |
1567 | 71 extern const uint8_t ff_zigzag248_direct[64]; |
190
9e0e56869d05
fix for non-mmx runtimedetect encoding bugs - patch by Michael Niedermayer <michaelni@gmx.at>
uid46427
parents:
174
diff
changeset
|
72 |
0 | 73 /* pixel operations */ |
2090 | 74 #define MAX_NEG_CROP 1024 |
0 | 75 |
76 /* temporary */ | |
4179 | 77 extern uint32_t ff_squareTbl[512]; |
4176 | 78 extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP]; |
0 | 79 |
1866
1755f959ab7f
seperated out the C-based VP3 DSP functions into a different file; also
melanson
parents:
1864
diff
changeset
|
80 /* VP3 DSP functions */ |
2693 | 81 void ff_vp3_idct_c(DCTELEM *block/* align 16*/); |
82 void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); | |
83 void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); | |
0 | 84 |
3245 | 85 /* 1/2^n downscaling functions from imgconvert.c */ |
86 void ff_img_copy_plane(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); | |
87 void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); | |
88 void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); | |
89 void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); | |
90 | |
3248
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3245
diff
changeset
|
91 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3245
diff
changeset
|
92 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height); |
7aa9f80e7954
mmx implementation of 3-point GMC. (5x faster than C)
lorenm
parents:
3245
diff
changeset
|
93 |
675 | 94 /* minimum alignment rules ;) |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
95 if you notice errors in the alignment annotations, need more alignment for some asm code on some CPU, |
675 | 96 or need to use a function with less aligned data, then send a mail to the ffmpeg-dev list, ... |
97 | |
98 !warning these alignments might not match reality (a missing attribute((align)) somewhere is possible) | |
99 I (michael) didn't check them; these are just the alignments which I think could be reached easily ... | |
100 | |
101 !future video codecs might need functions with less strict alignment | |
102 */ | |
0 | 103 |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
104 /* |
1064 | 105 void get_pixels_c(DCTELEM *block, const uint8_t *pixels, int line_size); |
106 void diff_pixels_c(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride); | |
107 void put_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size); | |
108 void add_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size); | |
296 | 109 void clear_blocks_c(DCTELEM *blocks); |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
110 */ |
0 | 111 |
112 /* add and put pixel (decoding) */ | |
675 | 113 // blocksizes for op_pixels_func are 8x4,8x8 16x8 16x16 |
1709 | 114 //h for op_pixels_func is limited to {width/2, width} but never larger than 16 and never smaller than 4 |
1064 | 115 typedef void (*op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int h); |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
116 typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h); |
1064 | 117 typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride); |
1168 | 118 typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y); |
2415 | 119 typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset); |
3029 | 120 typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset); |
0 | 121 |
984 | 122 #define DEF_OLD_QPEL(name)\ |
1064 | 123 void ff_put_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\ |
124 void ff_put_no_rnd_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\ | |
125 void ff_avg_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride); | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
126 |
984 | 127 DEF_OLD_QPEL(qpel16_mc11_old_c) |
128 DEF_OLD_QPEL(qpel16_mc31_old_c) | |
129 DEF_OLD_QPEL(qpel16_mc12_old_c) | |
130 DEF_OLD_QPEL(qpel16_mc32_old_c) | |
131 DEF_OLD_QPEL(qpel16_mc13_old_c) | |
132 DEF_OLD_QPEL(qpel16_mc33_old_c) | |
133 DEF_OLD_QPEL(qpel8_mc11_old_c) | |
134 DEF_OLD_QPEL(qpel8_mc31_old_c) | |
135 DEF_OLD_QPEL(qpel8_mc12_old_c) | |
136 DEF_OLD_QPEL(qpel8_mc32_old_c) | |
137 DEF_OLD_QPEL(qpel8_mc13_old_c) | |
138 DEF_OLD_QPEL(qpel8_mc33_old_c) | |
651 | 139 |
140 #define CALL_2X_PIXELS(a, b, n)\ | |
141 static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ | |
142 b(block , pixels , line_size, h);\ | |
143 b(block+n, pixels+n, line_size, h);\ | |
144 } | |
255 | 145 |
0 | 146 /* motion estimation */ |
1709 | 147 // h is limited to {width/2, width, 2*width} but never larger than 16 and never smaller than 2 |
148 // although currently h<4 is not used, as functions with width <8 are neither used nor implemented | |
1708 | 149 typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/; |
936 | 150 |
1168 | 151 |
3198
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3175
diff
changeset
|
152 // for snow slices |
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3175
diff
changeset
|
153 typedef struct slice_buffer_s slice_buffer; |
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3175
diff
changeset
|
154 |
1102 | 155 /** |
156 * DSPContext. | |
157 */ | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
158 typedef struct DSPContext { |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
159 /* pixel ops : interface with DCT */ |
1064 | 160 void (*get_pixels)(DCTELEM *block/*align 16*/, const uint8_t *pixels/*align 8*/, int line_size); |
161 void (*diff_pixels)(DCTELEM *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride); | |
162 void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); | |
1984
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
163 void (*put_signed_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); |
1064 | 164 void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size); |
2763 | 165 void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size); |
166 void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size); | |
1136 | 167 /** |
168 * translational global motion compensation. | |
169 */ | |
1064 | 170 void (*gmc1)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x16, int y16, int rounder); |
1136 | 171 /** |
172 * global motion compensation. | |
173 */ | |
1064 | 174 void (*gmc )(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int ox, int oy, |
2979 | 175 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height); |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
176 void (*clear_blocks)(DCTELEM *blocks/*align 16*/); |
1064 | 177 int (*pix_sum)(uint8_t * pix, int line_size); |
178 int (*pix_norm1)(uint8_t * pix, int line_size); | |
1708 | 179 // 16x16 8x8 4x4 2x2 16x8 8x4 4x2 8x16 4x8 2x4 |
2967 | 180 |
1729 | 181 me_cmp_func sad[5]; /* identical to pix_absAxA except additional void * */ |
182 me_cmp_func sse[5]; | |
183 me_cmp_func hadamard8_diff[5]; | |
184 me_cmp_func dct_sad[5]; | |
185 me_cmp_func quant_psnr[5]; | |
186 me_cmp_func bit[5]; | |
187 me_cmp_func rd[5]; | |
188 me_cmp_func vsad[5]; | |
189 me_cmp_func vsse[5]; | |
2065
9e4bebc39ade
noise preserving sum of squares comparission function
michael
parents:
2045
diff
changeset
|
190 me_cmp_func nsse[5]; |
2184 | 191 me_cmp_func w53[5]; |
192 me_cmp_func w97[5]; | |
2382 | 193 me_cmp_func dct_max[5]; |
3010
533c6386eca9
8x8 integer dct from x264 as cmp function (under CONFIG_GPL)
michael
parents:
2979
diff
changeset
|
194 me_cmp_func dct264_sad[5]; |
936 | 195 |
1708 | 196 me_cmp_func me_pre_cmp[5]; |
197 me_cmp_func me_cmp[5]; | |
198 me_cmp_func me_sub_cmp[5]; | |
199 me_cmp_func mb_cmp[5]; | |
1729 | 200 me_cmp_func ildct_cmp[5]; //only width 16 used |
2382 | 201 me_cmp_func frame_skip_cmp[5]; //only width 8 used |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
202 |
4749 | 203 int (*ssd_int8_vs_int16)(int8_t *pix1, int16_t *pix2, int size); |
204 | |
1136 | 205 /** |
206 * Halfpel motion compensation with rounding (a+b+1)>>1. | |
4751 | 207 * this is an array[4][4] of motion compensation functions for 4 |
1713 | 208 * horizontal blocksizes (8,16) and the 4 halfpel positions<br> |
1213 | 209 * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] |
1136 | 210 * @param block destination where the result is stored |
211 * @param pixels source | |
212 * @param line_size number of bytes in a horizontal line of block | |
213 * @param h height | |
214 */ | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
215 op_pixels_func put_pixels_tab[4][4]; |
1136 | 216 |
217 /** | |
218 * Halfpel motion compensation with rounding (a+b+1)>>1. | |
2967 | 219 * This is an array[4][4] of motion compensation functions for 4 |
1713 | 220 * horizontal blocksizes (8,16) and the 4 halfpel positions<br> |
1213 | 221 * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] |
1136 | 222 * @param block destination into which the result is averaged (a+b+1)>>1 |
223 * @param pixels source | |
224 * @param line_size number of bytes in a horizontal line of block | |
225 * @param h height | |
226 */ | |
1319 | 227 op_pixels_func avg_pixels_tab[4][4]; |
1136 | 228 |
229 /** | |
230 * Halfpel motion compensation with no rounding (a+b)>>1. | |
4751 | 231 * this is an array[2][4] of motion compensation functions for 2 |
1225 | 232 * horizontal blocksizes (8,16) and the 4 halfpel positions<br> |
1213 | 233 * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] |
1136 | 234 * @param block destination where the result is stored |
235 * @param pixels source | |
236 * @param line_size number of bytes in a horizontal line of block | |
237 * @param h height | |
238 */ | |
2075 | 239 op_pixels_func put_no_rnd_pixels_tab[4][4]; |
1136 | 240 |
241 /** | |
242 * Halfpel motion compensation with no rounding (a+b)>>1. | |
4751 | 243 * this is an array[2][4] of motion compensation functions for 2 |
1225 | 244 * horizontal blocksizes (8,16) and the 4 halfpel positions<br> |
1213 | 245 * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ] |
1136 | 246 * @param block destination into which the result is averaged (a+b)>>1 |
247 * @param pixels source | |
248 * @param line_size number of bytes in a horizontal line of block | |
249 * @param h height | |
250 */ | |
2075 | 251 op_pixels_func avg_no_rnd_pixels_tab[4][4]; |
2967 | 252 |
1864 | 253 void (*put_no_rnd_pixels_l2[2])(uint8_t *block/*align width (8 or 16)*/, const uint8_t *a/*align 1*/, const uint8_t *b/*align 1*/, int line_size, int h); |
2967 | 254 |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
255 /** |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
256 * Thirdpel motion compensation with rounding (a+b+1)>>1. |
4751 | 257 * this is an array[11] of motion compensation functions for the 9 thirdpel |
258 * positions<br> | |
1267
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
259 * *pixels_tab[ xthirdpel + 4*ythirdpel ] |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
260 * @param block destination where the result is stored |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
261 * @param pixels source |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
262 * @param line_size number of bytes in a horizontal line of block |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
263 * @param h height |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
264 */ |
85b71f9f7450
moving the svq3 motion compensation stuff to dsputil (this also means that existing optimized halfpel code is used now ...)
michaelni
parents:
1264
diff
changeset
|
265 tpel_mc_func put_tpel_pixels_tab[11]; //FIXME individual func ptr per width? |
1319 | 266 tpel_mc_func avg_tpel_pixels_tab[11]; //FIXME individual func ptr per width? |
267 | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
268 qpel_mc_func put_qpel_pixels_tab[2][16]; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
269 qpel_mc_func avg_qpel_pixels_tab[2][16]; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
270 qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16]; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
271 qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16]; |
936 | 272 qpel_mc_func put_mspel_pixels_tab[8]; |
2967 | 273 |
1168 | 274 /** |
4751 | 275 * h264 Chroma MC |
1168 | 276 */ |
277 h264_chroma_mc_func put_h264_chroma_pixels_tab[3]; | |
3663 | 278 /* This is really one func used in VC-1 decoding */ |
279 h264_chroma_mc_func put_no_rnd_h264_chroma_pixels_tab[3]; | |
1168 | 280 h264_chroma_mc_func avg_h264_chroma_pixels_tab[3]; |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
281 |
3020
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3010
diff
changeset
|
282 qpel_mc_func put_h264_qpel_pixels_tab[4][16]; |
c75fb0747e74
use h264 MC functions for 2xX Xx2 blocks in snow too
michael
parents:
3010
diff
changeset
|
283 qpel_mc_func avg_h264_qpel_pixels_tab[4][16]; |
2967 | 284 |
3807
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3746
diff
changeset
|
285 qpel_mc_func put_2tap_qpel_pixels_tab[4][16]; |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3746
diff
changeset
|
286 qpel_mc_func avg_2tap_qpel_pixels_tab[4][16]; |
6a40092eb9e6
approximate qpel functions: sacrifice some quality for some decoding speed. enabled on B-frames with -lavdopts fast.
lorenm
parents:
3746
diff
changeset
|
287 |
2415 | 288 h264_weight_func weight_h264_pixels_tab[10]; |
289 h264_biweight_func biweight_h264_pixels_tab[10]; | |
2967 | 290 |
3395
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3279
diff
changeset
|
291 /* AVS specific */ |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3279
diff
changeset
|
292 qpel_mc_func put_cavs_qpel_pixels_tab[2][16]; |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3279
diff
changeset
|
293 qpel_mc_func avg_cavs_qpel_pixels_tab[2][16]; |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3279
diff
changeset
|
294 void (*cavs_filter_lv)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2); |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3279
diff
changeset
|
295 void (*cavs_filter_lh)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2); |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3279
diff
changeset
|
296 void (*cavs_filter_cv)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2); |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3279
diff
changeset
|
297 void (*cavs_filter_ch)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2); |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3279
diff
changeset
|
298 void (*cavs_idct8_add)(uint8_t *dst, DCTELEM *block, int stride); |
adccbf4a1040
CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
michael
parents:
3279
diff
changeset
|
299 |
1708 | 300 me_cmp_func pix_abs[2][4]; |
2967 | 301 |
866 | 302 /* huffyuv specific */ |
303 void (*add_bytes)(uint8_t *dst/*align 16*/, uint8_t *src/*align 16*/, int w); | |
936 | 304 void (*diff_bytes)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 1*/,int w); |
1527 | 305 /** |
306 * subtract huffyuv's variant of median prediction | |
307 * note, this might read from src1[-1], src2[-1] | |
308 */ | |
309 void (*sub_hfyu_median_prediction)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top); | |
1273 | 310 void (*bswap_buf)(uint32_t *dst, uint32_t *src, int w); |
2633 | 311 |
2707
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
312 void (*h264_v_loop_filter_luma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
313 void (*h264_h_loop_filter_luma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
314 void (*h264_v_loop_filter_chroma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
315 void (*h264_h_loop_filter_chroma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
316 void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix, int stride, int alpha, int beta); |
360024d31dab
H.264 deblocking optimizations (mmx for chroma_bS4 case, convert existing cases to 8-bit math)
lorenm
parents:
2696
diff
changeset
|
317 void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix, int stride, int alpha, int beta); |
3645
47821be55b6c
mmx implementation of deblocking strength decision.
lorenm
parents:
3574
diff
changeset
|
318 // h264_loop_filter_strength: simd only. the C version is inlined in h264.c |
47821be55b6c
mmx implementation of deblocking strength decision.
lorenm
parents:
3574
diff
changeset
|
319 void (*h264_loop_filter_strength)(int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], |
47821be55b6c
mmx implementation of deblocking strength decision.
lorenm
parents:
3574
diff
changeset
|
320 int bidir, int edges, int step, int mask_mv0, int mask_mv1); |
2967 | 321 |
1644 | 322 void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale); |
323 void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale); | |
324 | |
2045 | 325 void (*h261_loop_filter)(uint8_t *src, int stride); |
2044
b6f2add2511e
h261 decoder by (Maarten Daniels <maarten.daniels at student dot luc dot ac dot be>)
michael
parents:
2024
diff
changeset
|
326 |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3555
diff
changeset
|
327 /* assume len is a multiple of 4, and arrays are 16-byte aligned */ |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
328 void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize); |
3574 | 329 /* assume len is a multiple of 8, and arrays are 16-byte aligned */ |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3555
diff
changeset
|
330 void (*vector_fmul)(float *dst, const float *src, int len); |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3555
diff
changeset
|
331 void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len); |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3555
diff
changeset
|
332 /* assume len is a multiple of 8, and src arrays are 16-byte aligned */ |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3555
diff
changeset
|
333 void (*vector_fmul_add_add)(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step); |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3555
diff
changeset
|
334 |
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3555
diff
changeset
|
335 /* C version: convert floats from the range [384.0,386.0] to ints in [-32768,32767] |
3660
7e1ee254a3ee
Align the input buffer in ffplay, introduce a public macro for aligned declarations
lu_zero
parents:
3656
diff
changeset
|
336 * simd versions: convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */ |
3568
945caa35ee9a
sse and 3dnow implementations of float->int conversion and mdct windowing.
lorenm
parents:
3555
diff
changeset
|
337 void (*float_to_int16)(int16_t *dst, const float *src, int len); |
3536
545a15c19c91
sse & sse2 implementations of vorbis channel coupling.
lorenm
parents:
3526
diff
changeset
|
338 |
1092 | 339 /* (I)DCT */ |
340 void (*fdct)(DCTELEM *block/* align 16*/); | |
1567 | 341 void (*fdct248)(DCTELEM *block/* align 16*/); |
2967 | 342 |
1324
7d328fd9d8a5
the return of the idct with 16bit output by ("Ivan Kalvachev" <ivan at cacad dot com>)
michaelni
parents:
1320
diff
changeset
|
343 /* IDCT really*/ |
7d328fd9d8a5
the return of the idct with 16bit output by ("Ivan Kalvachev" <ivan at cacad dot com>)
michaelni
parents:
1320
diff
changeset
|
344 void (*idct)(DCTELEM *block/* align 16*/); |
2967 | 345 |
1102 | 346 /** |
1104 | 347 * block -> idct -> clip to unsigned 8 bit -> dest. |
1102 | 348 * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...) |
1111 | 349 * @param line_size size in bytes of a horizontal line of dest |
1102 | 350 */ |
1092 | 351 void (*idct_put)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); |
2967 | 352 |
1102 | 353 /** |
354 * block -> idct -> add dest -> clip to unsigned 8 bit -> dest. | |
1111 | 355 * @param line_size size in bytes of a horizontal line of dest |
1102 | 356 */ |
1092 | 357 void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); |
2967 | 358 |
1102 | 359 /** |
1104 | 360 * idct input permutation. |
1241 | 361 * several optimized IDCTs need a permuted input (relative to the normal order of the reference |
362 * IDCT) | |
363 * this permutation must be performed before the idct_put/add, note, normally this can be merged | |
364 * with the zigzag/alternate scan<br> | |
1102 | 365 * an example to avoid confusion: |
366 * - (->decode coeffs -> zigzag reorder -> dequant -> reference idct ->...) | |
367 * - (x -> reference dct -> reference idct -> x) | |
368 * - (x -> reference dct -> simple_mmx_perm = idct_permutation -> simple_idct_mmx -> x) | |
369 * - (->decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant -> simple_idct_mmx ->...) | |
370 */ | |
1092 | 371 uint8_t idct_permutation[64]; |
372 int idct_permutation_type; | |
373 #define FF_NO_IDCT_PERM 1 | |
374 #define FF_LIBMPEG2_IDCT_PERM 2 | |
375 #define FF_SIMPLE_IDCT_PERM 3 | |
376 #define FF_TRANSPOSE_IDCT_PERM 4 | |
2696
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
2693
diff
changeset
|
377 #define FF_PARTTRANS_IDCT_PERM 5 |
1092 | 378 |
1784 | 379 int (*try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale); |
380 void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale); | |
381 #define BASIS_SHIFT 16 | |
382 #define RECON_SHIFT 6 | |
2967 | 383 |
4268 | 384 /* h264 functions */ |
2272
cd43603c46f9
move h264 idct to its own file and call via function pointer in DspContext
michael
parents:
2259
diff
changeset
|
385 void (*h264_idct_add)(uint8_t *dst, DCTELEM *block, int stride); |
2755 | 386 void (*h264_idct8_add)(uint8_t *dst, DCTELEM *block, int stride); |
3105
2d35fb3cb940
h264: special case dc-only idct. ~1% faster overall
lorenm
parents:
3089
diff
changeset
|
387 void (*h264_idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride); |
2d35fb3cb940
h264: special case dc-only idct. ~1% faster overall
lorenm
parents:
3089
diff
changeset
|
388 void (*h264_idct8_dc_add)(uint8_t *dst, DCTELEM *block, int stride); |
4279 | 389 void (*h264_dct)(DCTELEM block[4][4]); |
3198
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3175
diff
changeset
|
390 |
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3175
diff
changeset
|
391 /* snow wavelet */ |
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3175
diff
changeset
|
392 void (*vertical_compose97i)(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width); |
6b9f0c4fbdbe
First part of a series of speed-enchancing patches.
gpoirier
parents:
3175
diff
changeset
|
393 void (*horizontal_compose97i)(DWTELEM *b, int width); |
4436
d3e389536b0a
Add the const specifier as needed to reduce the number of warnings.
takis
parents:
4311
diff
changeset
|
394 void (*inner_add_yblock)(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8); |
3215
06f98047ff26
prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents:
3198
diff
changeset
|
395 |
06f98047ff26
prefetch pixels for future motion compensation. 2-5% faster h264.
lorenm
parents:
3198
diff
changeset
|
396 void (*prefetch)(void *mem, int stride, int h); |
3245 | 397 |
398 void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height); | |
3526 | 399 |
400 /* vc1 functions */ | |
401 void (*vc1_inv_trans_8x8)(DCTELEM *b); | |
402 void (*vc1_inv_trans_8x4)(DCTELEM *b, int n); | |
403 void (*vc1_inv_trans_4x8)(DCTELEM *b, int n); | |
404 void (*vc1_inv_trans_4x4)(DCTELEM *b, int n); | |
4239 | 405 void (*vc1_v_overlap)(uint8_t* src, int stride); |
406 void (*vc1_h_overlap)(uint8_t* src, int stride); | |
3526 | 407 /* put 8x8 block with bicubic interpolation and quarterpel precision |
408 * last argument is actually round value instead of height | |
409 */ | |
410 op_pixels_func put_vc1_mspel_pixels_tab[16]; | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
411 } DSPContext; |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
412 |
4197 | 413 void dsputil_static_init(void); |
1092 | 414 void dsputil_init(DSPContext* p, AVCodecContext *avctx); |
0 | 415 |
4281
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4279
diff
changeset
|
416 int ff_check_alignment(void); |
de525a2b41db
ff_check_alignment to warn the user about a missaligned stack
michael
parents:
4279
diff
changeset
|
417 |
764 | 418 /** |
419 * permute block according to permutation. | |
420 * @param last last non zero element in scantable order | |
421 */ | |
1064 | 422 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last); |
34 | 423 |
1729 | 424 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type); |
425 | |
/* Replicate the byte value c into each of the four bytes of a 32-bit word. */
#define BYTE_VEC32(c) ((c)*0x01010101UL)

/**
 * Average two words of four packed 8-bit values, byte by byte, rounding up.
 * Each result byte equals (a_byte + b_byte + 1) >> 1.
 * Masking out the low bit of every byte of (a ^ b) before the shift keeps a
 * bit from leaking into the neighbouring byte.
 */
static inline uint32_t rnd_avg32(uint32_t a, uint32_t b)
{
    return (a | b) - (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
}

/**
 * Average two words of four packed 8-bit values, byte by byte, rounding down.
 * Each result byte equals (a_byte + b_byte) >> 1.
 */
static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
{
    return (a & b) + (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
}
437 | |
2184 | 438 static inline int get_penalty_factor(int lambda, int lambda2, int type){ |
439 switch(type&0xFF){ | |
440 default: | |
441 case FF_CMP_SAD: | |
442 return lambda>>FF_LAMBDA_SHIFT; | |
443 case FF_CMP_DCT: | |
444 return (3*lambda)>>(FF_LAMBDA_SHIFT+1); | |
445 case FF_CMP_W53: | |
446 return (4*lambda)>>(FF_LAMBDA_SHIFT); | |
447 case FF_CMP_W97: | |
448 return (2*lambda)>>(FF_LAMBDA_SHIFT); | |
449 case FF_CMP_SATD: | |
3010
533c6386eca9
8x8 integer dct from x264 as cmp function (under CONFIG_GPL)
michael
parents:
2979
diff
changeset
|
450 case FF_CMP_DCT264: |
2184 | 451 return (2*lambda)>>FF_LAMBDA_SHIFT; |
452 case FF_CMP_RD: | |
453 case FF_CMP_PSNR: | |
454 case FF_CMP_SSE: | |
455 case FF_CMP_NSSE: | |
456 return lambda2>>FF_LAMBDA_SHIFT; | |
457 case FF_CMP_BIT: | |
458 return 1; | |
459 } | |
460 } | |
461 | |
1102 | 462 /** |
1104 | 463 * Empty mmx state. |
1102 | 464 * this must be called between any dsp function and float/double code. |
465 * for example sin(); dsp->idct_put(); emms_c(); cos() | |
466 */ | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
467 #define emms_c() |
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
468 |
995
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
984
diff
changeset
|
469 /* should be defined by architectures supporting |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
984
diff
changeset
|
470 one or more MultiMedia extension */ |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
984
diff
changeset
|
471 int mm_support(void); |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
984
diff
changeset
|
472 |
3089 | 473 #ifdef __GNUC__ |
474 #define DECLARE_ALIGNED_16(t,v) t v __attribute__ ((aligned (16))) | |
475 #else | |
476 #define DECLARE_ALIGNED_16(t,v) __declspec(align(16)) t v | |
477 #endif | |
1974
8c5489b2cf3e
move __align16 some place where non-MMX machines can see it
melanson
parents:
1972
diff
changeset
|
478 |
62 | 479 #if defined(HAVE_MMX) |
0 | 480 |
862 | 481 #undef emms_c |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
482 |
0 | 483 #define MM_MMX 0x0001 /* standard MMX */ |
484 #define MM_3DNOW 0x0004 /* AMD 3DNOW */ | |
485 #define MM_MMXEXT 0x0002 /* SSE integer functions or AMD MMX ext */ | |
486 #define MM_SSE 0x0008 /* SSE functions */ | |
487 #define MM_SSE2 0x0010 /* PIV SSE2 functions */ | |
2388 | 488 #define MM_3DNOWEXT 0x0020 /* AMD 3DNowExt */ |
3279
647a677c00a4
Remove unused and unsupported Cyrix's "Extended MMX",
gpoirier
parents:
3248
diff
changeset
|
489 #define MM_SSE3 0x0040 /* Prescott SSE3 functions */ |
4311
f9325f2bc762
Add SSSE3 (Core2 aka Conroe/Merom/Woodcrester new instructions) detection
gpoirier
parents:
4281
diff
changeset
|
490 #define MM_SSSE3 0x0080 /* Conroe SSSE3 functions */ |
0 | 491 |
4197 | 492 extern int mm_flags; |
493 | |
1064 | 494 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); |
495 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); | |
1984
ef919e9ef73e
separate out put_signed_pixels_clamped() into its own function and
melanson
parents:
1977
diff
changeset
|
496 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); |
0 | 497 |
/*
 * Issue the x86 "emms" instruction.
 * The "memory" clobber stops the compiler from moving memory accesses
 * across the instruction.
 */
static inline void emms(void)
{
    __asm __volatile ("emms;":::"memory");
}
ec4642daa6fe
added emms_c() macro which should can used in c code in both mmx/non mmx cases
glantau
parents:
2
diff
changeset
|
502 |
936 | 503 |
6
ec4642daa6fe
added emms_c() macro which should can used in c code in both mmx/non mmx cases
glantau
parents:
2
diff
changeset
|
/*
 * MMX build of emms_c(): calls emms() only when mm_flags reports MMX.
 * Wrapped in do { } while (0) so the macro acts as a single statement and
 * stays safe in an unbraced if/else; use it as "emms_c();".
 */
#define emms_c() \
do {\
    if (mm_flags & MM_MMX)\
        emms();\
} while (0)
509 | |
3089 | 510 #ifdef __GNUC__ |
511 #define DECLARE_ALIGNED_8(t,v) t v __attribute__ ((aligned (8))) | |
512 #else | |
513 #define DECLARE_ALIGNED_8(t,v) __declspec(align(8)) t v | |
514 #endif | |
515 | |
2324 | 516 #define STRIDE_ALIGN 8 |
0 | 517 |
1092 | 518 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx); |
519 void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx); | |
1065 | 520 |
62 | 521 #elif defined(ARCH_ARMV4L) |
522 | |
523 /* This is to use 4 bytes read to the IDCT pointers for some 'zero' | |
1974
8c5489b2cf3e
move __align16 some place where non-MMX machines can see it
melanson
parents:
1972
diff
changeset
|
524 line optimizations */ |
3089 | 525 #define DECLARE_ALIGNED_8(t,v) t v __attribute__ ((aligned (4))) |
2324 | 526 #define STRIDE_ALIGN 4 |
62 | 527 |
2776
930e56f92c57
IWMMXT configure support + runtime selection patch by (Gildas Bazin, gbazin : altern org)
michael
parents:
2763
diff
changeset
|
528 #define MM_IWMMXT 0x0100 /* XScale IWMMXT */ |
930e56f92c57
IWMMXT configure support + runtime selection patch by (Gildas Bazin, gbazin : altern org)
michael
parents:
2763
diff
changeset
|
529 |
4197 | 530 extern int mm_flags; |
531 | |
1092 | 532 void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx); |
62 | 533 |
88 | 534 #elif defined(HAVE_MLIB) |
535 | |
536 /* SPARC/VIS IDCT needs 8-byte aligned DCT blocks */ | |
3089 | 537 #define DECLARE_ALIGNED_8(t,v) t v __attribute__ ((aligned (8))) |
2324 | 538 #define STRIDE_ALIGN 8 |
88 | 539 |
1092 | 540 void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx); |
88 | 541 |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
1879
diff
changeset
|
542 #elif defined(ARCH_SPARC) |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
1879
diff
changeset
|
543 |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
1879
diff
changeset
|
544 /* SPARC/VIS IDCT needs 8-byte aligned DCT blocks */ |
3089 | 545 #define DECLARE_ALIGNED_8(t,v) t v __attribute__ ((aligned (8))) |
2324 | 546 #define STRIDE_ALIGN 8 |
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
1879
diff
changeset
|
547 void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx); |
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
1879
diff
changeset
|
548 |
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
190
diff
changeset
|
549 #elif defined(ARCH_ALPHA) |
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
190
diff
changeset
|
550 |
3089 | 551 #define DECLARE_ALIGNED_8(t,v) t v __attribute__ ((aligned (8))) |
2324 | 552 #define STRIDE_ALIGN 8 |
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
190
diff
changeset
|
553 |
1092 | 554 void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx); |
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
190
diff
changeset
|
555 |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
612
diff
changeset
|
556 #elif defined(ARCH_POWERPC) |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
612
diff
changeset
|
557 |
894
a408778eff87
altivec accelerated v-resample patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
884
diff
changeset
|
558 #define MM_ALTIVEC 0x0001 /* standard AltiVec */ |
a408778eff87
altivec accelerated v-resample patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
884
diff
changeset
|
559 |
4197 | 560 extern int mm_flags; |
561 | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1008
diff
changeset
|
562 #if defined(HAVE_ALTIVEC) && !defined(CONFIG_DARWIN) |
1653 | 563 #define pixel altivec_pixel |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1008
diff
changeset
|
564 #include <altivec.h> |
1653 | 565 #undef pixel |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1008
diff
changeset
|
566 #endif |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1008
diff
changeset
|
567 |
3089 | 568 #define DECLARE_ALIGNED_8(t,v) t v __attribute__ ((aligned (16))) |
2324 | 569 #define STRIDE_ALIGN 16 |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
612
diff
changeset
|
570 |
1092 | 571 void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx); |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
612
diff
changeset
|
572 |
689
efcbfbd18864
ps2 idct patch by (Leon van Stuivenberg <leonvs at iae dot nl>)
michaelni
parents:
687
diff
changeset
|
573 #elif defined(HAVE_MMI) |
efcbfbd18864
ps2 idct patch by (Leon van Stuivenberg <leonvs at iae dot nl>)
michaelni
parents:
687
diff
changeset
|
574 |
3089 | 575 #define DECLARE_ALIGNED_8(t,v) t v __attribute__ ((aligned (16))) |
2324 | 576 #define STRIDE_ALIGN 16 |
689
efcbfbd18864
ps2 idct patch by (Leon van Stuivenberg <leonvs at iae dot nl>)
michaelni
parents:
687
diff
changeset
|
577 |
1092 | 578 void dsputil_init_mmi(DSPContext* c, AVCodecContext *avctx); |
689
efcbfbd18864
ps2 idct patch by (Leon van Stuivenberg <leonvs at iae dot nl>)
michaelni
parents:
687
diff
changeset
|
579 |
1259
e8c3884f2c7e
sh4 optimized idct & bswap patch by (BERO <bero at geocities dot co dot jp>)
michaelni
parents:
1241
diff
changeset
|
580 #elif defined(ARCH_SH4) |
e8c3884f2c7e
sh4 optimized idct & bswap patch by (BERO <bero at geocities dot co dot jp>)
michaelni
parents:
1241
diff
changeset
|
581 |
3089 | 582 #define DECLARE_ALIGNED_8(t,v) t v __attribute__ ((aligned (8))) |
2324 | 583 #define STRIDE_ALIGN 8 |
1259
e8c3884f2c7e
sh4 optimized idct & bswap patch by (BERO <bero at geocities dot co dot jp>)
michaelni
parents:
1241
diff
changeset
|
584 |
e8c3884f2c7e
sh4 optimized idct & bswap patch by (BERO <bero at geocities dot co dot jp>)
michaelni
parents:
1241
diff
changeset
|
585 void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx); |
e8c3884f2c7e
sh4 optimized idct & bswap patch by (BERO <bero at geocities dot co dot jp>)
michaelni
parents:
1241
diff
changeset
|
586 |
3728 | 587 #elif defined(ARCH_BFIN) |
588 | |
589 #define DECLARE_ALIGNED_8(t,v) t v __attribute__ ((aligned (8))) | |
590 #define STRIDE_ALIGN 8 | |
591 | |
592 void dsputil_init_bfin(DSPContext* c, AVCodecContext *avctx); | |
593 | |
0 | 594 #else |
595 | |
3089 | 596 #define DECLARE_ALIGNED_8(t,v) t v __attribute__ ((aligned (8))) |
2324 | 597 #define STRIDE_ALIGN 8 |
0 | 598 |
599 #endif | |
600 | |
252
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
214
diff
changeset
|
601 /* PSNR */ |
1064 | 602 void get_psnr(uint8_t *orig_image[3], uint8_t *coded_image[3], |
252
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
214
diff
changeset
|
603 int orig_linesize[3], int coded_linesize, |
ddb1a0e94cf4
- Added PSNR feature to libavcodec and ffmpeg. By now just Y PSNR until I'm
pulento
parents:
214
diff
changeset
|
604 AVCodecContext *avctx); |
781 | 605 |
606 /* FFT computation */ | |
607 | |
608 /* NOTE: soon integer code will be added, so you must use the | |
609 FFTSample type */ | |
/* Scalar sample type used by the FFT/MDCT code. */
typedef float FFTSample;

/* Forward declaration: FFTContext::imdct_calc takes a pointer to it. */
struct MDCTContext;

/* One complex sample: real and imaginary part. */
typedef struct FFTComplex {
    FFTSample re, im;
} FFTComplex;

/*
 * State of one FFT instance.
 * NOTE(review): the field meanings below are inferred from the names and
 * from ff_fft_init(s, nbits, inverse); confirm against the implementation.
 */
typedef struct FFTContext {
    int nbits;              /* presumably log2 of the transform size */
    int inverse;            /* presumably non-zero for an inverse transform */
    uint16_t *revtab;       /* presumably the permutation table */
    FFTComplex *exptab;     /* presumably the twiddle factor table */
    FFTComplex *exptab1;    /* only used by SSE code */
    /* transform routine invoked by ff_fft_calc() */
    void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
    /* inverse MDCT routine; same signature as ff_imdct_calc() */
    void (*imdct_calc)(struct MDCTContext *s, FFTSample *output,
                       const FFTSample *input, FFTSample *tmp);
} FFTContext;
628 | |
1879
dd63cb7e5080
fft_*() renamed into ff_fft_*() patch by (Gildas Bazin <gbazin at altern dot org>)
michael
parents:
1866
diff
changeset
|
629 int ff_fft_init(FFTContext *s, int nbits, int inverse); |
dd63cb7e5080
fft_*() renamed into ff_fft_*() patch by (Gildas Bazin <gbazin at altern dot org>)
michael
parents:
1866
diff
changeset
|
630 void ff_fft_permute(FFTContext *s, FFTComplex *z); |
dd63cb7e5080
fft_*() renamed into ff_fft_*() patch by (Gildas Bazin <gbazin at altern dot org>)
michael
parents:
1866
diff
changeset
|
631 void ff_fft_calc_c(FFTContext *s, FFTComplex *z); |
dd63cb7e5080
fft_*() renamed into ff_fft_*() patch by (Gildas Bazin <gbazin at altern dot org>)
michael
parents:
1866
diff
changeset
|
632 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z); |
3175 | 633 void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z); |
634 void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z); | |
1879
dd63cb7e5080
fft_*() renamed into ff_fft_*() patch by (Gildas Bazin <gbazin at altern dot org>)
michael
parents:
1866
diff
changeset
|
635 void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z); |
975
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
954
diff
changeset
|
636 |
1879
dd63cb7e5080
fft_*() renamed into ff_fft_*() patch by (Gildas Bazin <gbazin at altern dot org>)
michael
parents:
1866
diff
changeset
|
637 static inline void ff_fft_calc(FFTContext *s, FFTComplex *z) |
781 | 638 { |
639 s->fft_calc(s, z); | |
640 } | |
1879
dd63cb7e5080
fft_*() renamed into ff_fft_*() patch by (Gildas Bazin <gbazin at altern dot org>)
michael
parents:
1866
diff
changeset
|
641 void ff_fft_end(FFTContext *s); |
781 | 642 |
643 /* MDCT computation */ | |
644 | |
645 typedef struct MDCTContext { | |
646 int n; /* size of MDCT (i.e. number of input data * 2) */ | |
647 int nbits; /* n = 2^nbits */ | |
648 /* pre/post rotation tables */ | |
649 FFTSample *tcos; | |
650 FFTSample *tsin; | |
651 FFTContext fft; | |
652 } MDCTContext; | |
653 | |
794 | 654 int ff_mdct_init(MDCTContext *s, int nbits, int inverse); |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
655 void ff_imdct_calc(MDCTContext *s, FFTSample *output, |
781 | 656 const FFTSample *input, FFTSample *tmp); |
3555 | 657 void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, |
658 const FFTSample *input, FFTSample *tmp); | |
3746 | 659 void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, |
660 const FFTSample *input, FFTSample *tmp); | |
853
eacc2dd8fd9d
* using DSPContext - so each codec could use its local (sub)set of CPU extension
kabi
parents:
838
diff
changeset
|
661 void ff_mdct_calc(MDCTContext *s, FFTSample *out, |
781 | 662 const FFTSample *input, FFTSample *tmp); |
794 | 663 void ff_mdct_end(MDCTContext *s); |
781 | 664 |
/*
 * Define a static function name16 that returns the sum of name8 applied to
 * the block at dst/src and to the block 8 bytes to its right, both with the
 * caller's stride and h.
 */
#define WARPER8_16(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    return name8(s, dst           , src           , stride, h)\
          +name8(s, dst+8         , src+8         , stride, h);\
}
670 | |
/*
 * Define a static function name16 that sums name8 over 8-row blocks:
 * the left and right halves of the top 8 rows, and, only when h == 16,
 * the left and right halves of the next 8 rows as well.
 * name8 is always called with a height of 8.
 */
#define WARPER8_16_SQ(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    int score=0;\
    score +=name8(s, dst           , src           , stride, 8);\
    score +=name8(s, dst+8         , src+8         , stride, 8);\
    if(h==16){\
        dst += 8*stride;\
        src += 8*stride;\
        score +=name8(s, dst           , src           , stride, 8);\
        score +=name8(s, dst+8         , src+8         , stride, 8);\
    }\
    return score;\
}
684 | |
4240 | 685 |
/**
 * Copy h rows of 2 bytes from src to dst.
 * Each row is moved with one LD16/ST16 pair (macros defined elsewhere;
 * presumably a 16-bit load and store).
 * @param dstStride byte offset between consecutive rows of dst
 * @param srcStride byte offset between consecutive rows of src
 * @param h         number of rows to copy
 */
static inline void copy_block2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST16(dst   , LD16(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
696 | |
/**
 * Copy h rows of 4 bytes from src to dst.
 * Each row is moved with one LD32/ST32 pair (macros defined elsewhere;
 * presumably a 32-bit load and store).
 * @param dstStride byte offset between consecutive rows of dst
 * @param srcStride byte offset between consecutive rows of src
 * @param h         number of rows to copy
 */
static inline void copy_block4(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
707 | |
/**
 * Copy h rows of 8 bytes from src to dst.
 * Each row is moved with two LD32/ST32 pairs (macros defined elsewhere;
 * presumably 32-bit loads and stores).
 * @param dstStride byte offset between consecutive rows of dst
 * @param srcStride byte offset between consecutive rows of src
 * @param h         number of rows to copy
 */
static inline void copy_block8(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst+=dstStride;
        src+=srcStride;
    }
}
719 | |
/**
 * Copy h rows of 9 bytes from src to dst.
 * The first 8 bytes of a row go through two LD32/ST32 pairs (macros defined
 * elsewhere; presumably 32-bit loads and stores); the ninth byte is copied
 * on its own.
 * @param dstStride byte offset between consecutive rows of dst
 * @param srcStride byte offset between consecutive rows of src
 * @param h         number of rows to copy
 */
static inline void copy_block9(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst[8]= src[8];
        dst+=dstStride;
        src+=srcStride;
    }
}
732 | |
/**
 * Copy h rows of 16 bytes from src to dst.
 * Each row is moved with four LD32/ST32 pairs (macros defined elsewhere;
 * presumably 32-bit loads and stores).
 * @param dstStride byte offset between consecutive rows of dst
 * @param srcStride byte offset between consecutive rows of src
 * @param h         number of rows to copy
 */
static inline void copy_block16(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst+=dstStride;
        src+=srcStride;
    }
}
746 | |
/**
 * Copy h rows of 17 bytes from src to dst.
 * The first 16 bytes of a row go through four LD32/ST32 pairs (macros
 * defined elsewhere; presumably 32-bit loads and stores); the seventeenth
 * byte is copied on its own.
 * @param dstStride byte offset between consecutive rows of dst
 * @param srcStride byte offset between consecutive rows of src
 * @param h         number of rows to copy
 */
static inline void copy_block17(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16];
        dst+=dstStride;
        src+=srcStride;
    }
}
761 | |
1040
998d5035b15b
win32: rint() does not seem to be defined with mingw32-gcc 2.95 - do you have a better solution ?
bellard
parents:
1033
diff
changeset
|
762 #endif |