annotate ppc/h264_altivec.c @ 6323:e6da66f378c7 libavcodec

mpegvideo.h has two function declarations with the 'inline' specifier but no definition for those functions. The C standard requires a definition to appear in the same translation unit for any function declared with 'inline'. Most of the files including mpegvideo.h do not define those functions. Fix this by removing the 'inline' specifiers from the header. patch by Uoti Urpala
author diego
date Sun, 03 Feb 2008 17:54:30 +0000
parents 8baa533764d4
children f7cbb7733146
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
1 /*
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
2 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
3 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3667
diff changeset
4 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3667
diff changeset
5 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3667
diff changeset
6 * FFmpeg is free software; you can redistribute it and/or
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
7 * modify it under the terms of the GNU Lesser General Public
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
8 * License as published by the Free Software Foundation; either
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3667
diff changeset
9 * version 2.1 of the License, or (at your option) any later version.
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
10 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3667
diff changeset
11 * FFmpeg is distributed in the hope that it will be useful,
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
14 * Lesser General Public License for more details.
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
15 *
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
16 * You should have received a copy of the GNU Lesser General Public
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3667
diff changeset
17 * License along with FFmpeg; if not, write to the Free Software
3036
0b546eab515d Update licensing information: The FSF changed postal address.
diego
parents: 2979
diff changeset
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
19 */
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
20
5010
d5ba514e3f4a Add libavcodec to compiler include flags in order to simplify header
diego
parents: 4294
diff changeset
21 #include "dsputil.h"
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
22
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
23 #include "gcc_fixes.h"
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
24
5750
09f99af1db40 Sanitize altivec code so it can be built with runtime check properly
lu_zero
parents: 5586
diff changeset
25 #include "dsputil_ppc.h"
6077
8baa533764d4 Add necessary #include, fixes the warnings:
diego
parents: 5750
diff changeset
26 #include "dsputil_altivec.h"
5750
09f99af1db40 Sanitize altivec code so it can be built with runtime check properly
lu_zero
parents: 5586
diff changeset
27 #include "util_altivec.h"
4260
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
28 #include "types_altivec.h"
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
29
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
30 #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s /* "put" store op: d is assigned s unchanged; the dst argument is accepted but not used */
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
31 #define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s) /* "avg" store op: d is assigned the AltiVec vec_avg of dst and s */
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
32
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
33 #define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC /* first inclusion of h264_template_altivec.c: select the put op and map the PREFIX_* names to put_/altivec_put_ identifiers; all of them are #undef'd after the #include */
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
34 #define PREFIX_h264_chroma_mc8_altivec put_h264_chroma_mc8_altivec
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
35 #define PREFIX_h264_chroma_mc8_num altivec_put_h264_chroma_mc8_num
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
36 #define PREFIX_h264_qpel16_h_lowpass_altivec put_h264_qpel16_h_lowpass_altivec
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
37 #define PREFIX_h264_qpel16_h_lowpass_num altivec_put_h264_qpel16_h_lowpass_num
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
38 #define PREFIX_h264_qpel16_v_lowpass_altivec put_h264_qpel16_v_lowpass_altivec
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
39 #define PREFIX_h264_qpel16_v_lowpass_num altivec_put_h264_qpel16_v_lowpass_num
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
40 #define PREFIX_h264_qpel16_hv_lowpass_altivec put_h264_qpel16_hv_lowpass_altivec
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
41 #define PREFIX_h264_qpel16_hv_lowpass_num altivec_put_h264_qpel16_hv_lowpass_num
3577
5be5a936c8a9 Clean up:make dsputil subfile names consistent
lu_zero
parents: 3544
diff changeset
42 #include "h264_template_altivec.c"
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
43 #undef OP_U8_ALTIVEC
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
44 #undef PREFIX_h264_chroma_mc8_altivec
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
45 #undef PREFIX_h264_chroma_mc8_num
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
46 #undef PREFIX_h264_qpel16_h_lowpass_altivec
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
47 #undef PREFIX_h264_qpel16_h_lowpass_num
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
48 #undef PREFIX_h264_qpel16_v_lowpass_altivec
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
49 #undef PREFIX_h264_qpel16_v_lowpass_num
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
50 #undef PREFIX_h264_qpel16_hv_lowpass_altivec
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
51 #undef PREFIX_h264_qpel16_hv_lowpass_num
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
52
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
53 #define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC /* second inclusion of h264_template_altivec.c: select the avg op and map the same PREFIX_* names to avg_/altivec_avg_ identifiers; all of them are #undef'd after the #include */
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
54 #define PREFIX_h264_chroma_mc8_altivec avg_h264_chroma_mc8_altivec
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
55 #define PREFIX_h264_chroma_mc8_num altivec_avg_h264_chroma_mc8_num
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
56 #define PREFIX_h264_qpel16_h_lowpass_altivec avg_h264_qpel16_h_lowpass_altivec
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
57 #define PREFIX_h264_qpel16_h_lowpass_num altivec_avg_h264_qpel16_h_lowpass_num
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
58 #define PREFIX_h264_qpel16_v_lowpass_altivec avg_h264_qpel16_v_lowpass_altivec
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
59 #define PREFIX_h264_qpel16_v_lowpass_num altivec_avg_h264_qpel16_v_lowpass_num
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
60 #define PREFIX_h264_qpel16_hv_lowpass_altivec avg_h264_qpel16_hv_lowpass_altivec
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
61 #define PREFIX_h264_qpel16_hv_lowpass_num altivec_avg_h264_qpel16_hv_lowpass_num
3577
5be5a936c8a9 Clean up:make dsputil subfile names consistent
lu_zero
parents: 3544
diff changeset
62 #include "h264_template_altivec.c"
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
63 #undef OP_U8_ALTIVEC
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
64 #undef PREFIX_h264_chroma_mc8_altivec
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
65 #undef PREFIX_h264_chroma_mc8_num
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
66 #undef PREFIX_h264_qpel16_h_lowpass_altivec
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
67 #undef PREFIX_h264_qpel16_h_lowpass_num
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
68 #undef PREFIX_h264_qpel16_v_lowpass_altivec
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
69 #undef PREFIX_h264_qpel16_v_lowpass_num
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
70 #undef PREFIX_h264_qpel16_hv_lowpass_altivec
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
71 #undef PREFIX_h264_qpel16_hv_lowpass_num
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
72
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
73 #define H264_MC(OPNAME, SIZE, CODETYPE) \
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
74 static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE (uint8_t *dst, uint8_t *src, int stride){\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
75 OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, stride, SIZE);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
76 }\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
77 \
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
78 static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
3544
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
79 DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
80 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
81 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
82 }\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
83 \
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
84 static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
85 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, stride, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
86 }\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
87 \
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
88 static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
3544
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
89 DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
90 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
91 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, stride, stride, SIZE);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
92 }\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
93 \
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
94 static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
3544
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
95 DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
96 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
97 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
98 }\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
99 \
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
100 static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
101 OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, stride, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
102 }\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
103 \
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
104 static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
3544
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
105 DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
106 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
107 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
108 }\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
109 \
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
110 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
3544
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
111 DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
112 DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
113 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
114 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
115 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
116 }\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
117 \
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
118 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
3544
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
119 DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
120 DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
121 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
122 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
123 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
124 }\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
125 \
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
126 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
3544
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
127 DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
128 DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
129 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
130 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
131 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
132 }\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
133 \
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
134 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
3544
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
135 DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
136 DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
137 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
138 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
139 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
140 }\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
141 \
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
142 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
3544
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
143 DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
144 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
145 }\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
146 \
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
147 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
3544
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
148 DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
149 DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
150 DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
151 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
152 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
153 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
154 }\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
155 \
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
156 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
3544
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
157 DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
158 DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
159 DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
160 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
161 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
162 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
163 }\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
164 \
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
165 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
3544
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
166 DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
167 DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
168 DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
169 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
170 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
171 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
172 }\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
173 \
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
174 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
3544
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
175 DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
176 DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
8bb61d9a2c40 avoid alignment hacks, luckly gcc does the right thing on arches different from x86
lu_zero
parents: 3337
diff changeset
177 DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
178 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
179 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
180 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
181 }\
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
182
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
183 /* this code assumes that stride % 16 == 0 */
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
184 void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
5019
41cabe79ba25 use macro Use DECLARE_ALIGNED_16 to align stack-allocated variables
gpoirier
parents: 5010
diff changeset
185 DECLARE_ALIGNED_16(signed int, ABCD[4]) =
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
186 {((8 - x) * (8 - y)),
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
187 ((x) * (8 - y)),
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
188 ((8 - x) * (y)),
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
189 ((x) * (y))};
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
190 register int i;
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
191 vec_u8_t fperm;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
192 const vec_s32_t vABCD = vec_ld(0, ABCD);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
193 const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
194 const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
195 const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
196 const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
197 LOAD_ZERO;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
198 const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
199 const vec_u16_t v6us = vec_splat_u16(6);
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
200 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
201 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
202
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
203 vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
204 vec_u8_t vsrc0uc, vsrc1uc;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
205 vec_s16_t vsrc0ssH, vsrc1ssH;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
206 vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
207 vec_s16_t vsrc2ssH, vsrc3ssH, psum;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
208 vec_u8_t vdst, ppsum, fsum;
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
209
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
210 if (((unsigned long)dst) % 16 == 0) {
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
211 fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
212 0x14, 0x15, 0x16, 0x17,
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
213 0x08, 0x09, 0x0A, 0x0B,
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
214 0x0C, 0x0D, 0x0E, 0x0F);
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
215 } else {
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
216 fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
217 0x04, 0x05, 0x06, 0x07,
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
218 0x18, 0x19, 0x1A, 0x1B,
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
219 0x1C, 0x1D, 0x1E, 0x1F);
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
220 }
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
221
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
222 vsrcAuc = vec_ld(0, src);
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
223
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
224 if (loadSecond)
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
225 vsrcBuc = vec_ld(16, src);
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
226 vsrcperm0 = vec_lvsl(0, src);
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
227 vsrcperm1 = vec_lvsl(1, src);
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
228
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
229 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
230 if (reallyBadAlign)
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
231 vsrc1uc = vsrcBuc;
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
232 else
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
233 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
234
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
235 vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc0uc);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
236 vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc1uc);
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
237
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
238 if (!loadSecond) {// -> !reallyBadAlign
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
239 for (i = 0 ; i < h ; i++) {
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
240
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
241
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
242 vsrcCuc = vec_ld(stride + 0, src);
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
243
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
244 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
245 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
246
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
247 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
248 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
249
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
250 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
251 psum = vec_mladd(vB, vsrc1ssH, psum);
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
252 psum = vec_mladd(vC, vsrc2ssH, psum);
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
253 psum = vec_mladd(vD, vsrc3ssH, psum);
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
254 psum = vec_add(v28ss, psum);
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
255 psum = vec_sra(psum, v6us);
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
256
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
257 vdst = vec_ld(0, dst);
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
258 ppsum = (vec_u8_t)vec_packsu(psum, psum);
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
259 fsum = vec_perm(vdst, ppsum, fperm);
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
260
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
261 vec_st(fsum, 0, dst);
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
262
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
263 vsrc0ssH = vsrc2ssH;
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
264 vsrc1ssH = vsrc3ssH;
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
265
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
266 dst += stride;
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
267 src += stride;
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
268 }
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
269 } else {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
270 vec_u8_t vsrcDuc;
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
271 for (i = 0 ; i < h ; i++) {
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
272 vsrcCuc = vec_ld(stride + 0, src);
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
273 vsrcDuc = vec_ld(stride + 16, src);
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
274
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
275 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
276 if (reallyBadAlign)
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
277 vsrc3uc = vsrcDuc;
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
278 else
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
279 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
280
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
281 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
282 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
283
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
284 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
285 psum = vec_mladd(vB, vsrc1ssH, psum);
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
286 psum = vec_mladd(vC, vsrc2ssH, psum);
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
287 psum = vec_mladd(vD, vsrc3ssH, psum);
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
288 psum = vec_add(v28ss, psum);
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
289 psum = vec_sr(psum, v6us);
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
290
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
291 vdst = vec_ld(0, dst);
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
292 ppsum = (vec_u8_t)vec_pack(psum, psum);
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
293 fsum = vec_perm(vdst, ppsum, fperm);
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
294
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
295 vec_st(fsum, 0, dst);
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
296
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
297 vsrc0ssH = vsrc2ssH;
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
298 vsrc1ssH = vsrc3ssH;
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
299
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
300 dst += stride;
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
301 src += stride;
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
302 }
3667
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
303 }
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
304 }
56e60411527c AltiVec version of put_no_rnd_h264_chroma_pixels_tab[0] (slightly changed version of put_h264_chroma_pixels_tab[0])
kostya
parents: 3659
diff changeset
305
3337
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
306 static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
307 const uint8_t * src2, int dst_stride,
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
308 int src_stride1, int h)
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
309 {
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
310 int i;
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
311 vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align;
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
312
3337
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
313 mask_ = vec_lvsl(0, src2);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
314
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
315 for (i = 0; i < h; i++) {
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
316
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
317 tmp1 = vec_ld(i * src_stride1, src1);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
318 mask = vec_lvsl(i * src_stride1, src1);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
319 tmp2 = vec_ld(i * src_stride1 + 15, src1);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
320
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
321 a = vec_perm(tmp1, tmp2, mask);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
322
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
323 tmp1 = vec_ld(i * 16, src2);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
324 tmp2 = vec_ld(i * 16 + 15, src2);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
325
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
326 b = vec_perm(tmp1, tmp2, mask_);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
327
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
328 tmp1 = vec_ld(0, dst);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
329 mask = vec_lvsl(0, dst);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
330 tmp2 = vec_ld(15, dst);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
331
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
332 d = vec_avg(a, b);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
333
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
334 edges = vec_perm(tmp2, tmp1, mask);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
335
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
336 align = vec_lvsr(0, dst);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
337
3659
dd55fb216497 Proper fix for the corner case that would have been corrected before, praise&blame to me and exg in equal shares
lu_zero
parents: 3658
diff changeset
338 tmp2 = vec_perm(d, edges, align);
3658
2a113750d778 Revert previous commit
lu_zero
parents: 3583
diff changeset
339 tmp1 = vec_perm(edges, d, align);
3337
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
340
3659
dd55fb216497 Proper fix for the corner case that would have been corrected before, praise&blame to me and exg in equal shares
lu_zero
parents: 3658
diff changeset
341 vec_st(tmp2, 15, dst);
3583
562758eaf7bf 10l, thanks to Emanuele Giaquinta <exg@gentoo.org> for testing and finding the issue
lu_zero
parents: 3577
diff changeset
342 vec_st(tmp1, 0 , dst);
3337
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
343
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
344 dst += dst_stride;
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
345 }
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
346 }
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
347
3337
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
348 static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
349 const uint8_t * src2, int dst_stride,
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
350 int src_stride1, int h)
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
351 {
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
352 int i;
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
353 vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align;
3337
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
354
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
355 mask_ = vec_lvsl(0, src2);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
356
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
357 for (i = 0; i < h; i++) {
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
358
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
359 tmp1 = vec_ld(i * src_stride1, src1);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
360 mask = vec_lvsl(i * src_stride1, src1);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
361 tmp2 = vec_ld(i * src_stride1 + 15, src1);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
362
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
363 a = vec_perm(tmp1, tmp2, mask);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
364
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
365 tmp1 = vec_ld(i * 16, src2);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
366 tmp2 = vec_ld(i * 16 + 15, src2);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
367
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
368 b = vec_perm(tmp1, tmp2, mask_);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
369
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
370 tmp1 = vec_ld(0, dst);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
371 mask = vec_lvsl(0, dst);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
372 tmp2 = vec_ld(15, dst);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
373
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
374 d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
375
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
376 edges = vec_perm(tmp2, tmp1, mask);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
377
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
378 align = vec_lvsr(0, dst);
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
379
3659
dd55fb216497 Proper fix for the corner case that would have been corrected before, praise&blame to me and exg in equal shares
lu_zero
parents: 3658
diff changeset
380 tmp2 = vec_perm(d, edges, align);
3658
2a113750d778 Revert previous commit
lu_zero
parents: 3583
diff changeset
381 tmp1 = vec_perm(edges, d, align);
3337
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
382
3659
dd55fb216497 Proper fix for the corner case that would have been corrected before, praise&blame to me and exg in equal shares
lu_zero
parents: 3658
diff changeset
383 vec_st(tmp2, 15, dst);
3583
562758eaf7bf 10l, thanks to Emanuele Giaquinta <exg@gentoo.org> for testing and finding the issue
lu_zero
parents: 3577
diff changeset
384 vec_st(tmp1, 0 , dst);
3337
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
385
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
386 dst += dst_stride;
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
387 }
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
388 }
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
389
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
390 /* Implemented but could be faster
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
391 #define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h)
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
392 #define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h)
3337
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
393 */
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
394
3337
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
/* Expand the H264_MC template twice, once with the "put_" prefix and once
 * with the "avg_" prefix, both for size 16 and the "altivec" suffix.
 * NOTE(review): H264_MC is defined outside this excerpt; presumably it
 * generates the 16x16 quarter-pel motion compensation entry points --
 * confirm against its definition. */
395 H264_MC(put_, 16, altivec)
bec1eb6d3746 put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents: 3089
diff changeset
396 H264_MC(avg_, 16, altivec)
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
397
4260
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
398
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
399 /****************************************************************************
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
400 * IDCT transform:
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
401 ****************************************************************************/
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
402
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
/*
 * One 1-D pass of the 4-point transform used by ff_h264_idct_add_altivec
 * (despite the "DCT" in the name it sits under the "IDCT transform" section).
 *
 * vb0..vb3: input vectors (Y[0]..Y[3] in the per-line comments); only read.
 * va0..va3: output vectors (x[0]..x[3] in the per-line comments); assigned.
 *
 * vz0..vz3 are NOT macro parameters: the expansion assigns to variables with
 * exactly those names, so they must already be declared in the scope where
 * the macro is used. The ".1/2" halvings are arithmetic right shifts by one
 * (vec_sra with a splatted shift count of 1).
 *
 * The last statement has no trailing semicolon; the invocation supplies it.
 * The expansion is a plain statement sequence (no braces / do-while), so it
 * must not be used as the unbraced body of an if/else or loop.
 */
403 #define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3) \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
404 /* 1st stage */ \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
405 vz0 = vec_add(vb0,vb2); /* temp[0] = Y[0] + Y[2] */ \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
406 vz1 = vec_sub(vb0,vb2); /* temp[1] = Y[0] - Y[2] */ \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
407 vz2 = vec_sra(vb1,vec_splat_u16(1)); \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
408 vz2 = vec_sub(vz2,vb3); /* temp[2] = Y[1].1/2 - Y[3] */ \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
409 vz3 = vec_sra(vb3,vec_splat_u16(1)); \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
410 vz3 = vec_add(vb1,vz3); /* temp[3] = Y[1] + Y[3].1/2 */ \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
411 /* 2nd stage: output */ \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
412 va0 = vec_add(vz0,vz3); /* x[0] = temp[0] + temp[3] */ \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
413 va1 = vec_add(vz1,vz2); /* x[1] = temp[1] + temp[2] */ \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
414 va2 = vec_sub(vz1,vz2); /* x[2] = temp[1] - temp[2] */ \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
415 va3 = vec_sub(vz0,vz3) /* x[3] = temp[0] - temp[3] */
5094
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
416
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
/*
 * Transpose a 4x4 matrix held in the first four elements of a0..a3 into the
 * first four elements of b0..b3, using three rounds of merge-high/merge-low.
 *
 * a0..a3: input rows; they are overwritten (reused as scratch in the second
 *         round), so their contents are not preserved.
 * b0..b3: output; after the last round the first four elements of bN hold
 *         element N of a0, a1, a2 and a3 (in that order).
 *
 * As used in ff_h264_idct_add_altivec the operands are vectors of eight
 * 16-bit elements. In the first round a0 is merged in as the second operand
 * everywhere; it only acts as filler, and the last four elements of each
 * result end up holding copies of a0's elements rather than matrix data.
 *
 * The last statement has no trailing semicolon; the invocation supplies it.
 */
417 #define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
418 b0 = vec_mergeh( a0, a0 ); \
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
419 b1 = vec_mergeh( a1, a0 ); \
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
420 b2 = vec_mergeh( a2, a0 ); \
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
421 b3 = vec_mergeh( a3, a0 ); \
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
422 a0 = vec_mergeh( b0, b2 ); \
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
423 a1 = vec_mergel( b0, b2 ); \
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
424 a2 = vec_mergeh( b1, b3 ); \
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
425 a3 = vec_mergel( b1, b3 ); \
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
426 b0 = vec_mergeh( a0, a2 ); \
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
427 b1 = vec_mergel( a0, a2 ); \
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
428 b2 = vec_mergeh( a1, a3 ); \
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
429 b3 = vec_mergel( a1, a3 )
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
430
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
/*
 * Add the 16-bit values in `va` to the pixels at `dst` and write the result
 * back as saturated unsigned bytes.
 *
 * va: vector of 16-bit residuals; it is modified (the pixel values are added
 *     into it in place).
 *
 * Besides `va`, the expansion uses these names from the enclosing scope:
 * dst, vdst_orig, vdst, vdst_mask, vdst_ss, va_u8, va_u32, element,
 * zero_u8v and zero_s16v.
 *
 * Steps: load the vector containing dst, permute it with vdst_mask so the
 * bytes starting at dst come first, interleave with zero bytes and
 * reinterpret as 16-bit values, add to va, pack with unsigned saturation
 * (vec_packsu), replicate the first 32-bit word of the packed result into
 * all four words, and store a single 32-bit element with vec_ste.
 *
 * NOTE(review): `element` is passed as the offset argument of vec_ste; since
 * va_u32 holds the same word in every lane after the splat, the stored value
 * does not depend on which lane is selected -- confirm the intent.
 *
 * Unlike the other helper macros here, the expansion already ends with a
 * semicolon, so `MACRO(x);` yields an extra empty statement. The expansion
 * is not wrapped in braces, so do not use it as an unbraced if/loop body.
 */
431 #define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
432 vdst_orig = vec_ld(0, dst); \
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
433 vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
434 vdst_ss = (vec_s16_t) vec_mergeh(zero_u8v, vdst); \
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
435 va = vec_add(va, vdst_ss); \
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
436 va_u8 = vec_packsu(va, zero_s16v); \
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
437 va_u32 = vec_splat((vec_u32_t)va_u8, 0); \
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
438 vec_ste(va_u32, element, (uint32_t*)dst);
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
439
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
/*
 * Apply the 4x4 inverse transform to `block` and add the result to the
 * pixels at `dst`, four rows of four bytes each.
 *
 * dst:    destination pixels; advanced by `stride` bytes between rows.
 * block:  the 16 coefficients, read with two vector loads at byte offsets 0
 *         and 16. block[0] is modified (32 is added for rounding).
 *         NOTE(review): vec_ld performs 16-byte-aligned loads, so block is
 *         presumably expected to be 16-byte aligned -- confirm with callers.
 * stride: byte distance between consecutive destination rows.
 *
 * LOAD_ZERO is defined outside this excerpt; presumably it provides the
 * zero_u8v / zero_s16v vectors that VEC_LOAD_U8_ADD_S16_STORE_U8 uses.
 */
440 static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
441 {
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
442 vec_s16_t va0, va1, va2, va3;
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
/* vz0..vz3 are the scratch variables that VEC_1D_DCT assigns to by name. */
443 vec_s16_t vz0, vz1, vz2, vz3;
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
444 vec_s16_t vtmp0, vtmp1, vtmp2, vtmp3;
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
445 vec_u8_t va_u8;
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
446 vec_u32_t va_u32;
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
447 vec_s16_t vdst_ss;
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
448 const vec_u16_t v6us = vec_splat_u16(6);
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
449 vec_u8_t vdst, vdst_orig;
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
/* The permute mask and the word index are derived once, from the initial dst.
 * NOTE(review): both are reused for all four rows, which assumes that adding
 * stride does not change dst's offset within a 16-byte block -- confirm. */
450 vec_u8_t vdst_mask = vec_lvsl(0, dst);
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
451 int element = ((unsigned long)dst & 0xf) >> 2;
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
452 LOAD_ZERO;
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
453
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
454 block[0] += 32; /* add 32 as a DC-level for rounding */
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
455
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
/* Load the coefficients as two vectors; vec_sld by 8 bytes rotates each one
 * so that its second half comes first, giving four vectors whose leading
 * four elements are the four groups of four coefficients. */
456 vtmp0 = vec_ld(0,block);
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
457 vtmp1 = vec_sld(vtmp0, vtmp0, 8);
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
458 vtmp2 = vec_ld(16,block);
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
459 vtmp3 = vec_sld(vtmp2, vtmp2, 8);
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
460
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
/* First 1-D pass, transpose, second 1-D pass. */
461 VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
462 VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3);
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
463 VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
464
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
/* Arithmetic shift right by 6; together with the +32 added to block[0]
 * above this is the rounding step. */
465 va0 = vec_sra(va0,v6us);
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
466 va1 = vec_sra(va1,v6us);
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
467 va2 = vec_sra(va2,v6us);
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
468 va3 = vec_sra(va3,v6us);
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
469
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
/* Add one result vector per destination row, stepping dst by stride. */
470 VEC_LOAD_U8_ADD_S16_STORE_U8(va0);
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
471 dst += stride;
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
472 VEC_LOAD_U8_ADD_S16_STORE_U8(va1);
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
473 dst += stride;
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
474 VEC_LOAD_U8_ADD_S16_STORE_U8(va2);
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
475 dst += stride;
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
476 VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
477 }
ce57e3f2b2a7 h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents: 5019
diff changeset
478
4260
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
/*
 * One 1-D pass of the 8-point inverse transform, written with AltiVec
 * intrinsics. Each intrinsic line is preceded by a comment giving the scalar
 * formula it implements (SRC(n) = sN, DST(n, x) = "dN = x").
 *
 * s0..s7: input vectors; only read.
 * d0..d7: output vectors; assigned only after every sN has been read, so the
 *         same variables may be passed as both inputs and outputs.
 *
 * The expansion also uses `onev` and `twov` from the enclosing scope as the
 * shift counts for the ">>1" and ">>2" terms (vec_sra, arithmetic shift).
 *
 * The body is a braced block that declares locals named a0v..a7v and
 * b0v..b7v; arguments must not be expressions that use those names. Because
 * the expansion is a compound statement, `IDCT8_1D_ALTIVEC(...);` leaves an
 * extra empty statement, which matters in an unbraced if/else.
 */
479 #define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
480 /* a0 = SRC(0) + SRC(4); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
481 vec_s16_t a0v = vec_add(s0, s4); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
482 /* a2 = SRC(0) - SRC(4); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
483 vec_s16_t a2v = vec_sub(s0, s4); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
484 /* a4 = (SRC(2)>>1) - SRC(6); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
485 vec_s16_t a4v = vec_sub(vec_sra(s2, onev), s6); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
486 /* a6 = (SRC(6)>>1) + SRC(2); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
487 vec_s16_t a6v = vec_add(vec_sra(s6, onev), s2); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
488 /* b0 = a0 + a6; */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
489 vec_s16_t b0v = vec_add(a0v, a6v); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
490 /* b2 = a2 + a4; */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
491 vec_s16_t b2v = vec_add(a2v, a4v); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
492 /* b4 = a2 - a4; */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
493 vec_s16_t b4v = vec_sub(a2v, a4v); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
494 /* b6 = a0 - a6; */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
495 vec_s16_t b6v = vec_sub(a0v, a6v); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
496 /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
497 /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
498 vec_s16_t a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
499 /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
500 /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
501 vec_s16_t a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
502 /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
503 /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
504 vec_s16_t a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
505 /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
506 vec_s16_t a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
507 /* b1 = (a7>>2) + a1; */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
508 vec_s16_t b1v = vec_add( vec_sra(a7v, twov), a1v); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
509 /* b3 = a3 + (a5>>2); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
510 vec_s16_t b3v = vec_add(a3v, vec_sra(a5v, twov)); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
511 /* b5 = (a3>>2) - a5; */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
512 vec_s16_t b5v = vec_sub( vec_sra(a3v, twov), a5v); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
513 /* b7 = a7 - (a1>>2); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
514 vec_s16_t b7v = vec_sub( a7v, vec_sra(a1v, twov)); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
515 /* DST(0, b0 + b7); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
516 d0 = vec_add(b0v, b7v); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
517 /* DST(1, b2 + b5); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
518 d1 = vec_add(b2v, b5v); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
519 /* DST(2, b4 + b3); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
520 d2 = vec_add(b4v, b3v); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
521 /* DST(3, b6 + b1); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
522 d3 = vec_add(b6v, b1v); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
523 /* DST(4, b6 - b1); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
524 d4 = vec_sub(b6v, b1v); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
525 /* DST(5, b4 - b3); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
526 d5 = vec_sub(b4v, b3v); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
527 /* DST(6, b2 - b5); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
528 d6 = vec_sub(b2v, b5v); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
529 /* DST(7, b0 - b7); */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
530 d7 = vec_sub(b0v, b7v); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
531 }
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
532
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
/*
 * Shift one vector of transform output right by 6, add it to the pixels at
 * `dest` with saturation, and write the clipped bytes back to `dest`, which
 * may be unaligned.
 *
 * dest:     pointer to the destination pixels.
 * idctv:    vector of 16-bit transform results (before the >>6).
 * perm_ldv: permute mask used to realign the two loaded vectors so the bytes
 *           starting at dest come first.
 * perm_stv: permute mask used to rotate the packed result and the select
 *           masks into position for the two aligned stores.
 * sel:      byte mask marking which bytes of the packed vector are results.
 *
 * The expansion also uses `sixv`, `zero_u8v` and `zero_s16v` from the
 * enclosing scope, and declares its own locals inside a braced block.
 *
 * Load: two vec_ld at offsets 0 and 7 are combined with vec_perm; the pixel
 * bytes are interleaved with zero bytes and reinterpreted as 16-bit values.
 * Sum:  vec_adds (saturating add), then vec_packsu with a zero vector as
 *       the first operand, so the result bytes are in the second half.
 * Store: the vector at offset 7 is merged (vec_sel) and stored first; the
 * vector at offset 0 is then RE-LOADED before being merged and stored.
 * NOTE(review): presumably the reload is needed because both offsets can
 * fall in the same 16-byte block, in which case a stale copy would undo the
 * first store -- confirm before reordering these statements.
 */
533 #define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
534 /* unaligned load */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
535 vec_u8_t hv = vec_ld( 0, dest ); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
536 vec_u8_t lv = vec_ld( 7, dest ); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
537 vec_u8_t dstv = vec_perm( hv, lv, (vec_u8_t)perm_ldv ); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
538 vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \
4294
d8d019a1728e GCC 3.3 compile fix
gpoirier
parents: 4260
diff changeset
539 vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \
4260
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
540 vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
541 vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
542 vec_u8_t edgehv; \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
543 /* unaligned store */ \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
544 vec_u8_t bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
545 vec_u8_t edgelv = vec_perm( sel, zero_u8v, perm_stv ); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
546 lv = vec_sel( lv, bodyv, edgelv ); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
547 vec_st( lv, 7, dest ); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
548 hv = vec_ld( 0, dest ); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
549 edgehv = vec_perm( zero_u8v, sel, perm_stv ); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
550 hv = vec_sel( hv, bodyv, edgehv ); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
551 vec_st( hv, 0, dest ); \
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
552 }
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
553
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
554 void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
555 vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7;
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
556 vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7;
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
557 vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
558
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
559 vec_u8_t perm_ldv = vec_lvsl(0, dst);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
560 vec_u8_t perm_stv = vec_lvsr(8, dst);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
561
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
562 const vec_u16_t onev = vec_splat_u16(1);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
563 const vec_u16_t twov = vec_splat_u16(2);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
564 const vec_u16_t sixv = vec_splat_u16(6);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
565
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
566 const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1);
4260
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
567 LOAD_ZERO;
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
568
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
569 dct[0] += 32; // rounding for the >>6 at the end
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
570
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
571 s0 = vec_ld(0x00, (int16_t*)dct);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
572 s1 = vec_ld(0x10, (int16_t*)dct);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
573 s2 = vec_ld(0x20, (int16_t*)dct);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
574 s3 = vec_ld(0x30, (int16_t*)dct);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
575 s4 = vec_ld(0x40, (int16_t*)dct);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
576 s5 = vec_ld(0x50, (int16_t*)dct);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
577 s6 = vec_ld(0x60, (int16_t*)dct);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
578 s7 = vec_ld(0x70, (int16_t*)dct);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
579
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
580 IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
581 d0, d1, d2, d3, d4, d5, d6, d7);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
582
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
583 TRANSPOSE8( d0, d1, d2, d3, d4, d5, d6, d7 );
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
584
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
585 IDCT8_1D_ALTIVEC(d0, d1, d2, d3, d4, d5, d6, d7,
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
586 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
587
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
588 ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
589 ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
590 ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
591 ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
592 ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
593 ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
594 ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
595 ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
596 }
0407913ac6c6 Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents: 4254
diff changeset
597
5119
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
/* Byte-interleave four 16-byte vectors in place.
 * On exit every group of 4 consecutive bytes holds byte k of the original
 * r0, r1, r2, r3 (in that order): r0 covers k = 0..3, r1 covers k = 4..7,
 * r2 covers k = 8..11 and r3 covers k = 12..15.
 * All four arguments must be modifiable vec_u8_t lvalues. */
#define transpose4x16(r0, r1, r2, r3) {                                    \
    /* first pass: interleave r0 with r2 and r1 with r3 */                 \
    register vec_u8_t t4x16_02h = vec_mergeh(r0, r2); /*0, 2 set 0*/       \
    register vec_u8_t t4x16_02l = vec_mergel(r0, r2); /*0, 2 set 1*/       \
    register vec_u8_t t4x16_13h = vec_mergeh(r1, r3); /*1, 3 set 0*/       \
    register vec_u8_t t4x16_13l = vec_mergel(r1, r3); /*1, 3 set 1*/       \
                                                                           \
    /* second pass: interleave the two partial results */                  \
    r0 = vec_mergeh(t4x16_02h, t4x16_13h);            /*all set 0*/        \
    r1 = vec_mergel(t4x16_02h, t4x16_13h);            /*all set 1*/        \
    r2 = vec_mergeh(t4x16_02l, t4x16_13l);            /*all set 2*/        \
    r3 = vec_mergel(t4x16_02l, t4x16_13l);            /*all set 3*/        \
}
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
614
5135
724e7fad19d9 cosmetics
gpoirier
parents: 5134
diff changeset
615 static inline void write16x4(uint8_t *dst, int dst_stride,
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
616 register vec_u8_t r0, register vec_u8_t r1,
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
617 register vec_u8_t r2, register vec_u8_t r3) {
5119
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
618 DECLARE_ALIGNED_16(unsigned char, result[64]);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
619 uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
620 int int_dst_stride = dst_stride/4;
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
621
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
622 vec_st(r0, 0, result);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
623 vec_st(r1, 16, result);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
624 vec_st(r2, 32, result);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
625 vec_st(r3, 48, result);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
626 /* FIXME: there has to be a better way!!!! */
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
627 *dst_int = *src_int;
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
628 *(dst_int+ int_dst_stride) = *(src_int + 1);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
629 *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
630 *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
631 *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
632 *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
633 *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
634 *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
635 *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
636 *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
637 *(dst_int+10*int_dst_stride) = *(src_int + 10);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
638 *(dst_int+11*int_dst_stride) = *(src_int + 11);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
639 *(dst_int+12*int_dst_stride) = *(src_int + 12);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
640 *(dst_int+13*int_dst_stride) = *(src_int + 13);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
641 *(dst_int+14*int_dst_stride) = *(src_int + 14);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
642 *(dst_int+15*int_dst_stride) = *(src_int + 15);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
643 }
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
644
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
/** \brief reads 16 rows from src and transposes their first 6 columns
    On exit r8..r13 hold byte 0..5 (respectively) of every row, i.e. element
    i of r8 is byte 0 of the row at src + i*src_stride, and so on.
    r8..r13 must be modifiable vec_u8_t lvalues; src and src_stride are
    expanded several times.
    \todo FIXME: see if we can't spare some vec_lvsl() by factorizing them
    out of unaligned_load() */
#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
    /* the 16 input rows */                                                \
    register vec_u8_t rt_in0  = unaligned_load(0,             src);        \
    register vec_u8_t rt_in1  = unaligned_load(   src_stride, src);        \
    register vec_u8_t rt_in2  = unaligned_load(2* src_stride, src);        \
    register vec_u8_t rt_in3  = unaligned_load(3* src_stride, src);        \
    register vec_u8_t rt_in4  = unaligned_load(4* src_stride, src);        \
    register vec_u8_t rt_in5  = unaligned_load(5* src_stride, src);        \
    register vec_u8_t rt_in6  = unaligned_load(6* src_stride, src);        \
    register vec_u8_t rt_in7  = unaligned_load(7* src_stride, src);        \
    register vec_u8_t rt_in14 = unaligned_load(14*src_stride, src);        \
    register vec_u8_t rt_in15 = unaligned_load(15*src_stride, src);        \
    register vec_u8_t rt_in8  = unaligned_load( 8*src_stride, src);        \
    register vec_u8_t rt_in9  = unaligned_load( 9*src_stride, src);        \
    register vec_u8_t rt_in10 = unaligned_load(10*src_stride, src);        \
    register vec_u8_t rt_in11 = unaligned_load(11*src_stride, src);        \
    register vec_u8_t rt_in12 = unaligned_load(12*src_stride, src);        \
    register vec_u8_t rt_in13 = unaligned_load(13*src_stride, src);        \
                                                                           \
    /* pass 1: pair row k with row k+8 */                                  \
    register vec_u8_t rt_p0 = vec_mergeh(rt_in0, rt_in8);   /*0, 8*/       \
    register vec_u8_t rt_p1 = vec_mergeh(rt_in1, rt_in9);   /*1, 9*/       \
    register vec_u8_t rt_p2 = vec_mergeh(rt_in2, rt_in10);  /*2,10*/       \
    register vec_u8_t rt_p3 = vec_mergeh(rt_in3, rt_in11);  /*3,11*/       \
    register vec_u8_t rt_p4 = vec_mergeh(rt_in4, rt_in12);  /*4,12*/       \
    register vec_u8_t rt_p5 = vec_mergeh(rt_in5, rt_in13);  /*5,13*/       \
    register vec_u8_t rt_p6 = vec_mergeh(rt_in6, rt_in14);  /*6,14*/       \
    register vec_u8_t rt_p7 = vec_mergeh(rt_in7, rt_in15);  /*7,15*/       \
                                                                           \
    /* pass 2: groups of four rows (k, k+4, k+8, k+12) */                  \
    register vec_u8_t rt_q0h = vec_mergeh(rt_p0, rt_p4); /*0,4, 8,12 set 0*/ \
    register vec_u8_t rt_q0l = vec_mergel(rt_p0, rt_p4); /*0,4, 8,12 set 1*/ \
    register vec_u8_t rt_q1h = vec_mergeh(rt_p1, rt_p5); /*1,5, 9,13 set 0*/ \
    register vec_u8_t rt_q1l = vec_mergel(rt_p1, rt_p5); /*1,5, 9,13 set 1*/ \
    register vec_u8_t rt_q2h = vec_mergeh(rt_p2, rt_p6); /*2,6,10,14 set 0*/ \
    register vec_u8_t rt_q2l = vec_mergel(rt_p2, rt_p6); /*2,6,10,14 set 1*/ \
    register vec_u8_t rt_q3h = vec_mergeh(rt_p3, rt_p7); /*3,7,11,15 set 0*/ \
    register vec_u8_t rt_q3l = vec_mergel(rt_p3, rt_p7); /*3,7,11,15 set 1*/ \
                                                                           \
    /* pass 3: all even rows / all odd rows, two columns per vector;       \
       the vectors that would hold columns 6 and 7 are not needed */       \
    register vec_u8_t rt_ev01 = vec_mergeh(rt_q0h, rt_q2h); /*even rows set 0*/ \
    register vec_u8_t rt_ev23 = vec_mergel(rt_q0h, rt_q2h); /*even rows set 1*/ \
    register vec_u8_t rt_ev45 = vec_mergeh(rt_q0l, rt_q2l); /*even rows set 2*/ \
    register vec_u8_t rt_od01 = vec_mergeh(rt_q1h, rt_q3h); /*odd rows set 0*/  \
    register vec_u8_t rt_od23 = vec_mergel(rt_q1h, rt_q3h); /*odd rows set 1*/  \
    register vec_u8_t rt_od45 = vec_mergeh(rt_q1l, rt_q3l); /*odd rows set 2*/  \
                                                                           \
    /* final pass: one column of all 16 rows per output vector */          \
    r8  = vec_mergeh(rt_ev01, rt_od01);  /*all set 0*/                     \
    r9  = vec_mergel(rt_ev01, rt_od01);  /*all set 1*/                     \
    r10 = vec_mergeh(rt_ev23, rt_od23);  /*all set 2*/                     \
    r11 = vec_mergel(rt_ev23, rt_od23);  /*all set 3*/                     \
    r12 = vec_mergeh(rt_ev45, rt_od45);  /*all set 4*/                     \
    r13 = vec_mergel(rt_ev45, rt_od45);  /*all set 5*/                     \
}
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
706
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
707 // out: o = |x-y| < a
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
708 static inline vec_u8_t diff_lt_altivec ( register vec_u8_t x,
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
709 register vec_u8_t y,
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
710 register vec_u8_t a) {
5119
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
711
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
712 register vec_u8_t diff = vec_subs(x, y);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
713 register vec_u8_t diffneg = vec_subs(y, x);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
714 register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
715 o = (vec_u8_t)vec_cmplt(o, a);
5119
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
716 return o;
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
717 }
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
718
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
719 static inline vec_u8_t h264_deblock_mask ( register vec_u8_t p0,
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
720 register vec_u8_t p1,
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
721 register vec_u8_t q0,
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
722 register vec_u8_t q1,
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
723 register vec_u8_t alpha,
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
724 register vec_u8_t beta) {
5119
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
725
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
726 register vec_u8_t mask;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
727 register vec_u8_t tempmask;
5119
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
728
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
729 mask = diff_lt_altivec(p0, q0, alpha);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
730 tempmask = diff_lt_altivec(p1, p0, beta);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
731 mask = vec_and(mask, tempmask);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
732 tempmask = diff_lt_altivec(q1, q0, beta);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
733 mask = vec_and(mask, tempmask);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
734
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
735 return mask;
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
736 }
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
737
5165
c99fa49eaa80 part 2/2 of fixing Altivec-accelerated H264 luma inloop filter
gpoirier
parents: 5164
diff changeset
738 // out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
739 static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
740 register vec_u8_t p1,
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
741 register vec_u8_t p2,
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
742 register vec_u8_t q0,
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
743 register vec_u8_t tc0) {
5164
830b9dd36fef convert h264_deblock_q1 to an inline function.
gpoirier
parents: 5159
diff changeset
744
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
745 register vec_u8_t average = vec_avg(p0, q0);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
746 register vec_u8_t temp;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
747 register vec_u8_t uncliped;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
748 register vec_u8_t ones;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
749 register vec_u8_t max;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
750 register vec_u8_t min;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
751 register vec_u8_t newp1;
5164
830b9dd36fef convert h264_deblock_q1 to an inline function.
gpoirier
parents: 5159
diff changeset
752
830b9dd36fef convert h264_deblock_q1 to an inline function.
gpoirier
parents: 5159
diff changeset
753 temp = vec_xor(average, p2);
830b9dd36fef convert h264_deblock_q1 to an inline function.
gpoirier
parents: 5159
diff changeset
754 average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */
830b9dd36fef convert h264_deblock_q1 to an inline function.
gpoirier
parents: 5159
diff changeset
755 ones = vec_splat_u8(1);
830b9dd36fef convert h264_deblock_q1 to an inline function.
gpoirier
parents: 5159
diff changeset
756 temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */
830b9dd36fef convert h264_deblock_q1 to an inline function.
gpoirier
parents: 5159
diff changeset
757 uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */
830b9dd36fef convert h264_deblock_q1 to an inline function.
gpoirier
parents: 5159
diff changeset
758 max = vec_adds(p1, tc0);
830b9dd36fef convert h264_deblock_q1 to an inline function.
gpoirier
parents: 5159
diff changeset
759 min = vec_subs(p1, tc0);
5165
c99fa49eaa80 part 2/2 of fixing Altivec-accelerated H264 luma inloop filter
gpoirier
parents: 5164
diff changeset
760 newp1 = vec_max(min, uncliped);
c99fa49eaa80 part 2/2 of fixing Altivec-accelerated H264 luma inloop filter
gpoirier
parents: 5164
diff changeset
761 newp1 = vec_min(max, newp1);
c99fa49eaa80 part 2/2 of fixing Altivec-accelerated H264 luma inloop filter
gpoirier
parents: 5164
diff changeset
762 return newp1;
5119
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
763 }
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
764
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
/*
 * Update p0 and q0 (the two pixels touching the edge) in place.
 *
 * All arithmetic is done on unsigned bytes: a correction term carrying a
 * bias of 160 is assembled from rounding averages, then the bias is removed
 * with two saturating subtractions, which yields the positive and the
 * negative part of the correction separately. Both parts are limited to
 * tc0masked before being applied with saturating adds/subtracts.
 *
 * p0 and q0 must be plain vec_u8_t lvalues. Arguments are expanded more than
 * once, so they must be free of side effects.
 */
#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \
    const vec_u8_t bias160c = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); /* 10 << 4 = 160 */ \
    \
    register vec_u8_t lsbit    = vec_xor(p0, q0); \
    register vec_u8_t notq1    = vec_nor(q1, q1);         /* 255 - q1 */ \
    register vec_u8_t notp0    = vec_nor(p0, p0);         /* 255 - p0 */ \
    register vec_u8_t halfdiff = vec_avg(q0, notp0);      /* (q0 - p0 + 256) >> 1 */ \
    register vec_u8_t accum    = vec_avg(p1, notq1);      /* (p1 - q1 + 256) >> 1 */ \
    register vec_u8_t bias160; \
    register vec_u8_t dpos; \
    register vec_u8_t dneg; \
    \
    accum   = vec_sr(accum, vec_splat_u8(1));             /* (p1 - q1 + 256) >> 2, bias 64 */ \
    lsbit   = vec_and(lsbit, vec_splat_u8(1));            /* lowest bit of p0 ^ q0 */ \
    accum   = vec_avg(accum, lsbit);                      /* rounding average, bias drops to 32 */ \
    accum   = vec_adds(accum, halfdiff);                  /* biases add up: 32 + 128 = 160 */ \
    bias160 = vec_ld(0, &bias160c); \
    dneg    = vec_subs(bias160, accum);                   /* saturates at 0: negative part of d */ \
    dpos    = vec_subs(accum, bias160);                   /* saturates at 0: positive part of d */ \
    dneg    = vec_min(tc0masked, dneg); \
    dpos    = vec_min(tc0masked, dpos); \
    p0      = vec_subs(p0, dneg); \
    q0      = vec_subs(q0, dpos); \
    p0      = vec_adds(p0, dpos); \
    q0      = vec_adds(q0, dneg); \
}
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
796
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
/*
 * Luma edge filter core operating on six vectors of pixels.
 *
 * p2, p1, p0 are the three vectors on one side of the edge and q0, q1, q2
 * the three on the other side; p1, p0, q0 and q1 are overwritten with the
 * filtered values, so they must be vec_u8_t lvalues. alpha and beta are
 * scalar thresholds, tc0 points to four signed bytes.
 *
 * Steps:
 *  - alpha and beta are written into an aligned scratch buffer, loaded and
 *    splatted to every lane, then handed to h264_deblock_mask();
 *  - the four tc0 bytes are replicated four times each (two vec_mergeh
 *    passes), and lanes whose tc0 is negative are removed from the mask;
 *  - new p1/q1 values are computed with h264_deblock_q1() using tc0 limited
 *    to the lanes selected by diff_lt_altivec() (presumably |p2 - p0| < beta
 *    and |q2 - q0| < beta, as the inline comments state; confirm against
 *    that helper);
 *  - the same lane masks are subtracted from finaltc0; if the selected
 *    lanes are all-ones this adds 1 per satisfied condition ("tc++");
 *  - p0/q0 are filtered with h264_deblock_p0_q0() while p1/q1 still hold
 *    their original values, and only then are p1/q1 replaced.
 *
 * The tc0 bytes are copied one at a time instead of through an int pointer:
 * tc0 is an int8_t array, so reading it as an int would break the aliasing
 * rules and assume an alignment the caller never promised. The resulting
 * byte layout in temp is the same.
 */
#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \
    DECLARE_ALIGNED_16(unsigned char, temp[16]); \
    register vec_u8_t alphavec; \
    register vec_u8_t betavec; \
    register vec_u8_t mask; \
    register vec_u8_t p1mask; \
    register vec_u8_t q1mask; \
    register vector signed char tc0vec; \
    register vec_u8_t finaltc0; \
    register vec_u8_t tc0masked; \
    register vec_u8_t newp1; \
    register vec_u8_t newq1; \
    \
    temp[0] = (alpha); \
    temp[1] = (beta); \
    alphavec = vec_ld(0, temp); \
    betavec = vec_splat(alphavec, 0x1); \
    alphavec = vec_splat(alphavec, 0x0); \
    mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */ \
    \
    /* bytewise copy of tc0[0..3]: no type punning, no alignment assumption */ \
    temp[0] = (tc0)[0]; \
    temp[1] = (tc0)[1]; \
    temp[2] = (tc0)[2]; \
    temp[3] = (tc0)[3]; \
    tc0vec = vec_ld(0, (signed char*)temp); \
    tc0vec = vec_mergeh(tc0vec, tc0vec); /* bytes 0..7, each doubled */ \
    tc0vec = vec_mergeh(tc0vec, tc0vec); /* bytes 0..3, each repeated 4 times */ \
    mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \
    finaltc0 = vec_and((vec_u8_t)tc0vec, mask); /* tc = tc0 */ \
    \
    p1mask = diff_lt_altivec(p2, p0, betavec); \
    p1mask = vec_and(p1mask, mask); /* if ( |p2 - p0| < beta) */ \
    tc0masked = vec_and(p1mask, (vec_u8_t)tc0vec); \
    finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \
    newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \
    /*end if*/ \
    \
    q1mask = diff_lt_altivec(q2, q0, betavec); \
    q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */ \
    tc0masked = vec_and(q1mask, (vec_u8_t)tc0vec); \
    finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \
    newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \
    /*end if*/ \
    \
    /* p0/q0 must be filtered before p1/q1 are overwritten */ \
    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \
    p1 = newp1; \
    q1 = newq1; \
}
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
842
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
843 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
844
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
845 if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
846 register vec_u8_t p2 = vec_ld(-3*stride, pix);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
847 register vec_u8_t p1 = vec_ld(-2*stride, pix);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
848 register vec_u8_t p0 = vec_ld(-1*stride, pix);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
849 register vec_u8_t q0 = vec_ld(0, pix);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
850 register vec_u8_t q1 = vec_ld(stride, pix);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
851 register vec_u8_t q2 = vec_ld(2*stride, pix);
5119
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
852 h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
853 vec_st(p1, -2*stride, pix);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
854 vec_st(p0, -1*stride, pix);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
855 vec_st(q0, 0, pix);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
856 vec_st(q1, stride, pix);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
857 }
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
858 }
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
859
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
860 static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
861
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5167
diff changeset
862 register vec_u8_t line0, line1, line2, line3, line4, line5;
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
863 if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
5119
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
864 return;
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
865 readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
866 h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
867 transpose4x16(line1, line2, line3, line4);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
868 write16x4(pix-2, stride, line1, line2, line3, line4);
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
869 }
ad0c45e0008c Altivec version of h264_(h|v)_loop_filter_luma
gpoirier
parents: 5094
diff changeset
870
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
871 void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
872
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
873 if (has_altivec()) {
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
874 c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
875 c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec;
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
876 c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
877 c->h264_idct_add = ff_h264_idct_add_altivec;
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
878 c->h264_idct8_add = ff_h264_idct8_add_altivec;
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
879 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
880 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
881
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
882 #define dspfunc(PFX, IDX, NUM) \
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
883 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
884 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
885 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
886 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
887 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
888 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
889 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
890 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
891 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
892 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
893 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
894 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
895 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
896 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
897 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
898 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
899
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
900 dspfunc(put_h264_qpel, 0, 16);
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
901 dspfunc(avg_h264_qpel, 0, 16);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
902 #undef dspfunc
5586
f065fc609145 whitespace/indentation cosmetics
diego
parents: 5585
diff changeset
903 }
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
904 }