Mercurial > libavcodec.hg
annotate ppc/h264_altivec.c @ 12454:f4355cd85faa libavcodec
Port latest x264 deblock asm (before they moved to using NV12 as internal
format), LGPL'ed with permission from Jason and Loren. This includes mmx2
code, so remove inline asm from h264dsp_mmx.c accordingly.
author | rbultje |
---|---|
date | Fri, 03 Sep 2010 16:52:46 +0000 |
parents | 3cd4cd0509cd |
children | 06abedae2906 |
rev | line source |
---|---|
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
1 /* |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
2 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
3 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3667
diff
changeset
|
4 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3667
diff
changeset
|
5 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3667
diff
changeset
|
6 * FFmpeg is free software; you can redistribute it and/or |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
7 * modify it under the terms of the GNU Lesser General Public |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
8 * License as published by the Free Software Foundation; either |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3667
diff
changeset
|
9 * version 2.1 of the License, or (at your option) any later version. |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
10 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3667
diff
changeset
|
11 * FFmpeg is distributed in the hope that it will be useful, |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
14 * Lesser General Public License for more details. |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
15 * |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
16 * You should have received a copy of the GNU Lesser General Public |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3667
diff
changeset
|
17 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
19 */ |
2967 | 20 |
6763 | 21 #include "libavcodec/dsputil.h" |
8461
11307ea31e57
Disable usage of ff_h264_idct_add_altivec since AltiVec versions of h264_idct_add16,
gpoirier
parents:
7376
diff
changeset
|
22 #include "libavcodec/h264data.h" |
11499 | 23 #include "libavcodec/h264dsp.h" |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
24 |
6077 | 25 #include "dsputil_altivec.h" |
5750
09f99af1db40
Sanitize altivec code so it can be built with runtime check properly
lu_zero
parents:
5586
diff
changeset
|
26 #include "util_altivec.h" |
4260
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truly with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
27 #include "types_altivec.h" |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
28 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
29 #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
30 #define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s) |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
31 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
32 #define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
33 #define PREFIX_h264_chroma_mc8_altivec put_h264_chroma_mc8_altivec |
9444 | 34 #define PREFIX_no_rnd_vc1_chroma_mc8_altivec put_no_rnd_vc1_chroma_mc8_altivec |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
35 #define PREFIX_h264_chroma_mc8_num altivec_put_h264_chroma_mc8_num |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
36 #define PREFIX_h264_qpel16_h_lowpass_altivec put_h264_qpel16_h_lowpass_altivec |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
37 #define PREFIX_h264_qpel16_h_lowpass_num altivec_put_h264_qpel16_h_lowpass_num |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
38 #define PREFIX_h264_qpel16_v_lowpass_altivec put_h264_qpel16_v_lowpass_altivec |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
39 #define PREFIX_h264_qpel16_v_lowpass_num altivec_put_h264_qpel16_v_lowpass_num |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
40 #define PREFIX_h264_qpel16_hv_lowpass_altivec put_h264_qpel16_hv_lowpass_altivec |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
41 #define PREFIX_h264_qpel16_hv_lowpass_num altivec_put_h264_qpel16_hv_lowpass_num |
3577 | 42 #include "h264_template_altivec.c" |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
43 #undef OP_U8_ALTIVEC |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
44 #undef PREFIX_h264_chroma_mc8_altivec |
9444 | 45 #undef PREFIX_no_rnd_vc1_chroma_mc8_altivec |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
46 #undef PREFIX_h264_chroma_mc8_num |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
47 #undef PREFIX_h264_qpel16_h_lowpass_altivec |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
48 #undef PREFIX_h264_qpel16_h_lowpass_num |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
49 #undef PREFIX_h264_qpel16_v_lowpass_altivec |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
50 #undef PREFIX_h264_qpel16_v_lowpass_num |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
51 #undef PREFIX_h264_qpel16_hv_lowpass_altivec |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
52 #undef PREFIX_h264_qpel16_hv_lowpass_num |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
53 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
54 #define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
55 #define PREFIX_h264_chroma_mc8_altivec avg_h264_chroma_mc8_altivec |
9444 | 56 #define PREFIX_no_rnd_vc1_chroma_mc8_altivec avg_no_rnd_vc1_chroma_mc8_altivec |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
57 #define PREFIX_h264_chroma_mc8_num altivec_avg_h264_chroma_mc8_num |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
58 #define PREFIX_h264_qpel16_h_lowpass_altivec avg_h264_qpel16_h_lowpass_altivec |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
59 #define PREFIX_h264_qpel16_h_lowpass_num altivec_avg_h264_qpel16_h_lowpass_num |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
60 #define PREFIX_h264_qpel16_v_lowpass_altivec avg_h264_qpel16_v_lowpass_altivec |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
61 #define PREFIX_h264_qpel16_v_lowpass_num altivec_avg_h264_qpel16_v_lowpass_num |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
62 #define PREFIX_h264_qpel16_hv_lowpass_altivec avg_h264_qpel16_hv_lowpass_altivec |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
63 #define PREFIX_h264_qpel16_hv_lowpass_num altivec_avg_h264_qpel16_hv_lowpass_num |
3577 | 64 #include "h264_template_altivec.c" |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
65 #undef OP_U8_ALTIVEC |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
66 #undef PREFIX_h264_chroma_mc8_altivec |
9444 | 67 #undef PREFIX_no_rnd_vc1_chroma_mc8_altivec |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
68 #undef PREFIX_h264_chroma_mc8_num |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
69 #undef PREFIX_h264_qpel16_h_lowpass_altivec |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
70 #undef PREFIX_h264_qpel16_h_lowpass_num |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
71 #undef PREFIX_h264_qpel16_v_lowpass_altivec |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
72 #undef PREFIX_h264_qpel16_v_lowpass_num |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
73 #undef PREFIX_h264_qpel16_hv_lowpass_altivec |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
74 #undef PREFIX_h264_qpel16_hv_lowpass_num |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
75 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
76 #define H264_MC(OPNAME, SIZE, CODETYPE) \ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
77 static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE (uint8_t *dst, uint8_t *src, int stride){\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
78 OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, stride, SIZE);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
79 }\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
80 \ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
81 static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \ |
11369 | 82 DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
83 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
84 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
85 }\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
86 \ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
87 static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
88 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, stride, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
89 }\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
90 \ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
91 static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ |
11369 | 92 DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
93 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
94 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, stride, stride, SIZE);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
95 }\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
96 \ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
97 static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ |
11369 | 98 DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
99 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
100 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
101 }\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
102 \ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
103 static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
104 OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, stride, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
105 }\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
106 \ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
107 static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ |
11369 | 108 DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\ |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
109 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
110 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
111 }\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
112 \ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
113 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ |
11369 | 114 DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ |
115 DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ | |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
116 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
117 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
118 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
119 }\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
120 \ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
121 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ |
11369 | 122 DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ |
123 DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ | |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
124 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
125 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
126 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
127 }\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
128 \ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
129 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ |
11369 | 130 DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ |
131 DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ | |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
132 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
133 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
134 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
135 }\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
136 \ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
137 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ |
11369 | 138 DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ |
139 DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ | |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
140 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
141 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
142 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
143 }\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
144 \ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
145 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ |
11369 | 146 DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
147 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
148 }\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
149 \ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
150 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ |
11369 | 151 DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ |
152 DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ | |
153 DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ | |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
154 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
155 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
156 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
157 }\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
158 \ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
159 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ |
11369 | 160 DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\ |
161 DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ | |
162 DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ | |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
163 put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
164 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
165 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
166 }\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
167 \ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
168 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ |
11369 | 169 DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ |
170 DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ | |
171 DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ | |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
172 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
173 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
174 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
175 }\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
176 \ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
177 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ |
11369 | 178 DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\ |
179 DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\ | |
180 DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\ | |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
181 put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
182 put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
183 OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
184 }\ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
185 |
3337
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
186 static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
187 const uint8_t * src2, int dst_stride, |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
188 int src_stride1, int h) |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
189 { |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
190 int i; |
8494 | 191 vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align; |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
192 |
3337
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
193 mask_ = vec_lvsl(0, src2); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
194 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
195 for (i = 0; i < h; i++) { |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
196 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
197 tmp1 = vec_ld(i * src_stride1, src1); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
198 mask = vec_lvsl(i * src_stride1, src1); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
199 tmp2 = vec_ld(i * src_stride1 + 15, src1); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
200 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
201 a = vec_perm(tmp1, tmp2, mask); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
202 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
203 tmp1 = vec_ld(i * 16, src2); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
204 tmp2 = vec_ld(i * 16 + 15, src2); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
205 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
206 b = vec_perm(tmp1, tmp2, mask_); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
207 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
208 tmp1 = vec_ld(0, dst); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
209 mask = vec_lvsl(0, dst); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
210 tmp2 = vec_ld(15, dst); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
211 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
212 d = vec_avg(a, b); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
213 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
214 edges = vec_perm(tmp2, tmp1, mask); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
215 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
216 align = vec_lvsr(0, dst); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
217 |
3659
dd55fb216497
Proper fix for the corner case that would have been corrected before, praise&blame to me and exg in equal shares
lu_zero
parents:
3658
diff
changeset
|
218 tmp2 = vec_perm(d, edges, align); |
3658 | 219 tmp1 = vec_perm(edges, d, align); |
3337
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
220 |
3659
dd55fb216497
Proper fix for the corner case that would have been corrected before, praise&blame to me and exg in equal shares
lu_zero
parents:
3658
diff
changeset
|
221 vec_st(tmp2, 15, dst); |
3583
562758eaf7bf
10l, thanks to Emanuele Giaquinta <exg@gentoo.org> for testing and finding the issue
lu_zero
parents:
3577
diff
changeset
|
222 vec_st(tmp1, 0 , dst); |
3337
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
223 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
224 dst += dst_stride; |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
225 } |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
226 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
227 |
3337
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
228 static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
229 const uint8_t * src2, int dst_stride, |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
230 int src_stride1, int h) |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
231 { |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
232 int i; |
8494 | 233 vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align; |
3337
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
234 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
235 mask_ = vec_lvsl(0, src2); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
236 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
237 for (i = 0; i < h; i++) { |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
238 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
239 tmp1 = vec_ld(i * src_stride1, src1); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
240 mask = vec_lvsl(i * src_stride1, src1); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
241 tmp2 = vec_ld(i * src_stride1 + 15, src1); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
242 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
243 a = vec_perm(tmp1, tmp2, mask); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
244 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
245 tmp1 = vec_ld(i * 16, src2); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
246 tmp2 = vec_ld(i * 16 + 15, src2); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
247 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
248 b = vec_perm(tmp1, tmp2, mask_); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
249 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
250 tmp1 = vec_ld(0, dst); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
251 mask = vec_lvsl(0, dst); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
252 tmp2 = vec_ld(15, dst); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
253 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
254 d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b)); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
255 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
256 edges = vec_perm(tmp2, tmp1, mask); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
257 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
258 align = vec_lvsr(0, dst); |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
259 |
3659
dd55fb216497
Proper fix for the corner case that would have been corrected before, praise&blame to me and exg in equal shares
lu_zero
parents:
3658
diff
changeset
|
260 tmp2 = vec_perm(d, edges, align); |
3658 | 261 tmp1 = vec_perm(edges, d, align); |
3337
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
262 |
3659
dd55fb216497
Proper fix for the corner case that would have been corrected before, praise&blame to me and exg in equal shares
lu_zero
parents:
3658
diff
changeset
|
263 vec_st(tmp2, 15, dst); |
3583
562758eaf7bf
10l, thanks to Emanuele Giaquinta <exg@gentoo.org> for testing and finding the issue
lu_zero
parents:
3577
diff
changeset
|
264 vec_st(tmp1, 0 , dst); |
3337
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
265 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
266 dst += dst_stride; |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
267 } |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
268 } |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
269 |
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
270 /* Implemented but could be faster |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
271 #define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h) |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
272 #define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h) |
3337
bec1eb6d3746
put_pixels16_l2_altivec and avg_pixels16_l2_altivec
lu_zero
parents:
3089
diff
changeset
|
273 */ |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
274 |
7333
a8a79f5385f6
cosmetics: Reformat PPC code in libavcodec according to style guidelines.
diego
parents:
6763
diff
changeset
|
275 H264_MC(put_, 16, altivec) |
a8a79f5385f6
cosmetics: Reformat PPC code in libavcodec according to style guidelines.
diego
parents:
6763
diff
changeset
|
276 H264_MC(avg_, 16, altivec) |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
277 |
4260
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
278 |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
279 /**************************************************************************** |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
280 * IDCT transform: |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
281 ****************************************************************************/ |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
282 |
/*
 * One 1-D pass of the 4-point transform butterfly.
 *
 * vb0..vb3 are the inputs, va0..va3 receive the outputs. The macro writes
 * vz0..vz3, which must be declared in the enclosing scope, and it expands
 * to a statement sequence without a trailing semicolon (the caller adds it).
 */
#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3)                           \
    /* stage 1: intermediates */                                              \
    vz0 = vec_add(vb0, vb2);                            /* Y[0] + Y[2]     */ \
    vz1 = vec_sub(vb0, vb2);                            /* Y[0] - Y[2]     */ \
    vz2 = vec_sub(vec_sra(vb1, vec_splat_u16(1)), vb3); /* (Y[1]>>1) - Y[3] */\
    vz3 = vec_add(vb1, vec_sra(vb3, vec_splat_u16(1))); /* Y[1] + (Y[3]>>1) */\
    /* stage 2: outputs */                                                    \
    va0 = vec_add(vz0, vz3);                            /* x[0] = z0 + z3  */ \
    va1 = vec_add(vz1, vz2);                            /* x[1] = z1 + z2  */ \
    va2 = vec_sub(vz1, vz2);                            /* x[2] = z1 - z2  */ \
    va3 = vec_sub(vz0, vz3)                             /* x[3] = z0 - z3  */
5094
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
296 |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
/*
 * Three rounds of vec_mergeh/vec_mergel that take a0..a3 as input and leave
 * the result in b0..b3 (used between the two 1-D passes of the 4x4 IDCT).
 *
 * Note that a0..a3 are overwritten as scratch by the second round, and that
 * the macro expands to a statement sequence without a trailing semicolon.
 */
#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
    /* round 1: every input merged with a0 */    \
    b0 = vec_mergeh(a0, a0);                     \
    b1 = vec_mergeh(a1, a0);                     \
    b2 = vec_mergeh(a2, a0);                     \
    b3 = vec_mergeh(a3, a0);                     \
    /* round 2: pair b0/b2 and b1/b3 */          \
    a0 = vec_mergeh(b0, b2);                     \
    a1 = vec_mergel(b0, b2);                     \
    a2 = vec_mergeh(b1, b3);                     \
    a3 = vec_mergel(b1, b3);                     \
    /* round 3: pair a0/a2 and a1/a3 */          \
    b0 = vec_mergeh(a0, a2);                     \
    b1 = vec_mergel(a0, a2);                     \
    b2 = vec_mergeh(a1, a3);                     \
    b3 = vec_mergel(a1, a3)
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
310 |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
/*
 * Add the 16-bit vector va to the bytes found at dst and store one 32-bit
 * element of the packed result back to dst. va itself is overwritten with
 * the sum.
 *
 * Relies on names from the enclosing scope: dst, vdst_mask, element,
 * zero_u8v, zero_s16v, and the scratch variables vdst_orig, vdst, vdst_ss,
 * va_u8 and va_u32.
 */
#define VEC_LOAD_U8_ADD_S16_STORE_U8(va)                     \
    /* fetch destination bytes and realign with vdst_mask */ \
    vdst_orig = vec_ld(0, dst);                              \
    vdst      = vec_perm(vdst_orig, zero_u8v, vdst_mask);    \
    /* interleave with zero bytes to form 16-bit lanes */    \
    vdst_ss   = (vec_s16) vec_mergeh(zero_u8v, vdst);        \
    va        = vec_add(va, vdst_ss);                        \
    /* pack back to unsigned bytes */                        \
    va_u8     = vec_packsu(va, zero_s16v);                   \
    /* replicate 32-bit element 0, then store one element */ \
    va_u32    = vec_splat((vec_u32)va_u8, 0);                \
    vec_ste(va_u32, element, (uint32_t*)dst);
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
319 |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
320 static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride) |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
321 { |
8494 | 322 vec_s16 va0, va1, va2, va3; |
323 vec_s16 vz0, vz1, vz2, vz3; | |
324 vec_s16 vtmp0, vtmp1, vtmp2, vtmp3; | |
325 vec_u8 va_u8; | |
326 vec_u32 va_u32; | |
327 vec_s16 vdst_ss; | |
328 const vec_u16 v6us = vec_splat_u16(6); | |
329 vec_u8 vdst, vdst_orig; | |
330 vec_u8 vdst_mask = vec_lvsl(0, dst); | |
5094
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
331 int element = ((unsigned long)dst & 0xf) >> 2; |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
332 LOAD_ZERO; |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
333 |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
334 block[0] += 32; /* add 32 as a DC-level for rounding */ |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
335 |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
336 vtmp0 = vec_ld(0,block); |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
337 vtmp1 = vec_sld(vtmp0, vtmp0, 8); |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
338 vtmp2 = vec_ld(16,block); |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
339 vtmp3 = vec_sld(vtmp2, vtmp2, 8); |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
340 |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
341 VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
342 VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3); |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
343 VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
344 |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
345 va0 = vec_sra(va0,v6us); |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
346 va1 = vec_sra(va1,v6us); |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
347 va2 = vec_sra(va2,v6us); |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
348 va3 = vec_sra(va3,v6us); |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
349 |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
350 VEC_LOAD_U8_ADD_S16_STORE_U8(va0); |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
351 dst += stride; |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
352 VEC_LOAD_U8_ADD_S16_STORE_U8(va1); |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
353 dst += stride; |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
354 VEC_LOAD_U8_ADD_S16_STORE_U8(va2); |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
355 dst += stride; |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
356 VEC_LOAD_U8_ADD_S16_STORE_U8(va3); |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
357 } |
ce57e3f2b2a7
h264_idct_add_altivec, based on an old patch from Mauricio Alvarez <alvarezATac.upc.edu>, polished by David Conrad <umovimusATgmail.com>
lu_zero
parents:
5019
diff
changeset
|
358 |
4260
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
/*
 * One 1-D pass of the 8-point IDCT on vec_s16 vectors.
 *
 * s0..s7 are the inputs (SRC(0)..SRC(7) in the comments), d0..d7 receive
 * the outputs (DST(0)..DST(7)). The expansion is a braced block that
 * declares its own temporaries; it expects the shift-count vectors onev and
 * twov to exist in the enclosing scope.
 */
#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\
    /* even part, stage 1 */                                                   \
    vec_s16 a0v = vec_add(s0, s4);                 /* a0 = SRC(0) + SRC(4) */  \
    vec_s16 a2v = vec_sub(s0, s4);                 /* a2 = SRC(0) - SRC(4) */  \
    vec_s16 a4v = vec_sub(vec_sra(s2, onev), s6);  /* a4 = (SRC(2)>>1) - SRC(6) */ \
    vec_s16 a6v = vec_add(vec_sra(s6, onev), s2);  /* a6 = (SRC(6)>>1) + SRC(2) */ \
    /* even part, stage 2 */                                                   \
    vec_s16 b0v = vec_add(a0v, a6v);               /* b0 = a0 + a6 */          \
    vec_s16 b2v = vec_add(a2v, a4v);               /* b2 = a2 + a4 */          \
    vec_s16 b4v = vec_sub(a2v, a4v);               /* b4 = a2 - a4 */          \
    vec_s16 b6v = vec_sub(a0v, a6v);               /* b6 = a0 - a6 */          \
    /* odd part, stage 1 */                                                    \
    /* a1 = (SRC(5) - SRC(3)) - (SRC(7) + (SRC(7)>>1)) */                      \
    vec_s16 a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
    /* a3 = (SRC(7) + SRC(1)) - (SRC(3) + (SRC(3)>>1)) */                      \
    vec_s16 a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) ); \
    /* a5 = (SRC(7) - SRC(1)) + (SRC(5) + (SRC(5)>>1)) */                      \
    vec_s16 a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) ); \
    /* a7 = (SRC(5) + SRC(3)) + (SRC(1) + (SRC(1)>>1)) */                      \
    vec_s16 a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) ); \
    /* odd part, stage 2 */                                                    \
    vec_s16 b1v = vec_add( vec_sra(a7v, twov), a1v); /* b1 = (a7>>2) + a1 */   \
    vec_s16 b3v = vec_add(a3v, vec_sra(a5v, twov));  /* b3 = a3 + (a5>>2) */   \
    vec_s16 b5v = vec_sub( vec_sra(a3v, twov), a5v); /* b5 = (a3>>2) - a5 */   \
    vec_s16 b7v = vec_sub( a7v, vec_sra(a1v, twov)); /* b7 = a7 - (a1>>2) */   \
    /* outputs */                                                              \
    d0 = vec_add(b0v, b7v);                        /* DST(0) = b0 + b7 */      \
    d1 = vec_add(b2v, b5v);                        /* DST(1) = b2 + b5 */      \
    d2 = vec_add(b4v, b3v);                        /* DST(2) = b4 + b3 */      \
    d3 = vec_add(b6v, b1v);                        /* DST(3) = b6 + b1 */      \
    d4 = vec_sub(b6v, b1v);                        /* DST(4) = b6 - b1 */      \
    d5 = vec_sub(b4v, b3v);                        /* DST(5) = b4 - b3 */      \
    d6 = vec_sub(b2v, b5v);                        /* DST(6) = b2 - b5 */      \
    d7 = vec_sub(b0v, b7v);                        /* DST(7) = b0 - b7 */      \
}
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
412 |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
/*
 * Add one row of IDCT output to the pixels at dest and write the clipped
 * sums back; dest does not have to be 16-byte aligned.
 *
 *   dest      address of the first destination pixel of the row
 *   idctv     vec_s16 row of IDCT output (shifted right by sixv here)
 *   perm_ldv  permute vector applied to the two loaded quadwords
 *   perm_stv  permute vector applied to the packed result and to sel
 *   sel       byte mask that, once permuted, picks the lanes to overwrite
 *
 * Uses sixv, zero_u8v and zero_s16v from the enclosing scope.
 * dest is expanded several times, so it must be free of side effects.
 */
#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) {          \
    /* the two aligned quadwords reached through dest+0 and dest+7 */           \
    vec_u8  ssc_hi      = vec_ld(0, dest);                                      \
    vec_u8  ssc_lo      = vec_ld(7, dest);                                      \
    /* current pixels, interleaved with zero bytes to form 16-bit lanes */      \
    vec_u8  ssc_pix     = vec_perm(ssc_hi, ssc_lo, (vec_u8)perm_ldv);           \
    vec_u16 ssc_pix16   = (vec_u16)vec_mergeh(zero_u8v, ssc_pix);               \
    /* (idct >> 6) + pixel, saturating, then packed to unsigned bytes */        \
    vec_s16 ssc_scaled  = vec_sra(idctv, sixv);                                 \
    vec_s16 ssc_sum     = vec_adds(ssc_scaled, (vec_s16)ssc_pix16);             \
    vec_u8  ssc_sum8    = vec_packsu(zero_s16v, ssc_sum);                       \
    /* result and both lane masks rotated into store position */                \
    vec_u8  ssc_body    = vec_perm(ssc_sum8, ssc_sum8, perm_stv);               \
    vec_u8  ssc_mask_lo = vec_perm(sel, zero_u8v, perm_stv);                    \
    vec_u8  ssc_mask_hi = vec_perm(zero_u8v, sel, perm_stv);                    \
    /* store through dest+7 first; the quadword at dest+0 is re-read */         \
    /* afterwards so that it already contains that store if both    */          \
    /* addresses fall into the same quadword                        */          \
    ssc_lo = vec_sel(ssc_lo, ssc_body, ssc_mask_lo);                            \
    vec_st(ssc_lo, 7, dest);                                                    \
    ssc_hi = vec_ld(0, dest);                                                   \
    ssc_hi = vec_sel(ssc_hi, ssc_body, ssc_mask_hi);                            \
    vec_st(ssc_hi, 0, dest);                                                    \
}
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
433 |
11382
50415a8f1451
PPC: move prototypes to headers and make some functions static
mru
parents:
11369
diff
changeset
|
434 static void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) { |
8494 | 435 vec_s16 s0, s1, s2, s3, s4, s5, s6, s7; |
436 vec_s16 d0, d1, d2, d3, d4, d5, d6, d7; | |
437 vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7; | |
4260
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
438 |
8494 | 439 vec_u8 perm_ldv = vec_lvsl(0, dst); |
440 vec_u8 perm_stv = vec_lvsr(8, dst); | |
4260
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
441 |
8494 | 442 const vec_u16 onev = vec_splat_u16(1); |
443 const vec_u16 twov = vec_splat_u16(2); | |
444 const vec_u16 sixv = vec_splat_u16(6); | |
4260
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
445 |
8494 | 446 const vec_u8 sel = (vec_u8) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1}; |
4260
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
447 LOAD_ZERO; |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
448 |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
449 dct[0] += 32; // rounding for the >>6 at the end |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
450 |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
451 s0 = vec_ld(0x00, (int16_t*)dct); |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
452 s1 = vec_ld(0x10, (int16_t*)dct); |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
453 s2 = vec_ld(0x20, (int16_t*)dct); |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
454 s3 = vec_ld(0x30, (int16_t*)dct); |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
455 s4 = vec_ld(0x40, (int16_t*)dct); |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
456 s5 = vec_ld(0x50, (int16_t*)dct); |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
457 s6 = vec_ld(0x60, (int16_t*)dct); |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
458 s7 = vec_ld(0x70, (int16_t*)dct); |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
459 |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
460 IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
461 d0, d1, d2, d3, d4, d5, d6, d7); |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
462 |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
463 TRANSPOSE8( d0, d1, d2, d3, d4, d5, d6, d7 ); |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
464 |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
465 IDCT8_1D_ALTIVEC(d0, d1, d2, d3, d4, d5, d6, d7, |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
466 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7); |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
467 |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
468 ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel); |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
469 ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel); |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
470 ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel); |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
471 ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel); |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
472 ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel); |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
473 ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel); |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
474 ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel); |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
475 ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel); |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
476 } |
0407913ac6c6
Add IDCT8 routine in Altivec. Patch by yours truely with Linux fixes by Luca Barbato
gpoirier
parents:
4254
diff
changeset
|
477 |
8530 | 478 static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, DCTELEM *block, int stride, int size) |
479 { | |
480 vec_s16 dc16; | |
481 vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner; | |
482 LOAD_ZERO; | |
11369 | 483 DECLARE_ALIGNED(16, int, dc); |
8530 | 484 int i; |
485 | |
486 dc = (block[0] + 32) >> 6; | |
487 dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1); | |
488 | |
489 if (size == 4) | |
490 dc16 = vec_sld(dc16, zero_s16v, 8); | |
491 dcplus = vec_packsu(dc16, zero_s16v); | |
492 dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v); | |
493 | |
494 aligner = vec_lvsr(0, dst); | |
495 dcplus = vec_perm(dcplus, dcplus, aligner); | |
496 dcminus = vec_perm(dcminus, dcminus, aligner); | |
497 | |
498 for (i = 0; i < size; i += 4) { | |
499 v0 = vec_ld(0, dst+0*stride); | |
500 v1 = vec_ld(0, dst+1*stride); | |
501 v2 = vec_ld(0, dst+2*stride); | |
502 v3 = vec_ld(0, dst+3*stride); | |
503 | |
504 v0 = vec_adds(v0, dcplus); | |
505 v1 = vec_adds(v1, dcplus); | |
506 v2 = vec_adds(v2, dcplus); | |
507 v3 = vec_adds(v3, dcplus); | |
508 | |
509 v0 = vec_subs(v0, dcminus); | |
510 v1 = vec_subs(v1, dcminus); | |
511 v2 = vec_subs(v2, dcminus); | |
512 v3 = vec_subs(v3, dcminus); | |
513 | |
514 vec_st(v0, 0, dst+0*stride); | |
515 vec_st(v1, 0, dst+1*stride); | |
516 vec_st(v2, 0, dst+2*stride); | |
517 vec_st(v3, 0, dst+3*stride); | |
518 | |
519 dst += 4*stride; | |
8461
11307ea31e57
Disable usage of ff_h264_idct_add_altivec since AltiVec versions of h264_idct_add16,
gpoirier
parents:
7376
diff
changeset
|
520 } |
11307ea31e57
Disable usage of ff_h264_idct_add_altivec since AltiVec versions of h264_idct_add16,
gpoirier
parents:
7376
diff
changeset
|
521 } |
11307ea31e57
Disable usage of ff_h264_idct_add_altivec since AltiVec versions of h264_idct_add16,
gpoirier
parents:
7376
diff
changeset
|
522 |
8530 | 523 static void h264_idct_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride) |
524 { | |
525 h264_idct_dc_add_internal(dst, block, stride, 4); | |
526 } | |
527 | |
528 static void ff_h264_idct8_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride) | |
529 { | |
530 h264_idct_dc_add_internal(dst, block, stride, 8); | |
531 } | |
532 | |
8544
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
533 static void ff_h264_idct_add16_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
534 int i; |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
535 for(i=0; i<16; i++){ |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
536 int nnz = nnzc[ scan8[i] ]; |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
537 if(nnz){ |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
538 if(nnz==1 && block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride); |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
539 else ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride); |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
540 } |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
541 } |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
542 } |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
543 |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
544 static void ff_h264_idct_add16intra_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
545 int i; |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
546 for(i=0; i<16; i++){ |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
547 if(nnzc[ scan8[i] ]) ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride); |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
548 else if(block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride); |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
549 } |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
550 } |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
551 |
8461
11307ea31e57
Disable usage of ff_h264_idct_add_altivec since AltiVec versions of h264_idct_add16,
gpoirier
parents:
7376
diff
changeset
|
552 static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ |
11307ea31e57
Disable usage of ff_h264_idct_add_altivec since AltiVec versions of h264_idct_add16,
gpoirier
parents:
7376
diff
changeset
|
553 int i; |
11307ea31e57
Disable usage of ff_h264_idct_add_altivec since AltiVec versions of h264_idct_add16,
gpoirier
parents:
7376
diff
changeset
|
554 for(i=0; i<16; i+=4){ |
11307ea31e57
Disable usage of ff_h264_idct_add_altivec since AltiVec versions of h264_idct_add16,
gpoirier
parents:
7376
diff
changeset
|
555 int nnz = nnzc[ scan8[i] ]; |
11307ea31e57
Disable usage of ff_h264_idct_add_altivec since AltiVec versions of h264_idct_add16,
gpoirier
parents:
7376
diff
changeset
|
556 if(nnz){ |
11307ea31e57
Disable usage of ff_h264_idct_add_altivec since AltiVec versions of h264_idct_add16,
gpoirier
parents:
7376
diff
changeset
|
557 if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_altivec(dst + block_offset[i], block + i*16, stride); |
11307ea31e57
Disable usage of ff_h264_idct_add_altivec since AltiVec versions of h264_idct_add16,
gpoirier
parents:
7376
diff
changeset
|
558 else ff_h264_idct8_add_altivec (dst + block_offset[i], block + i*16, stride); |
11307ea31e57
Disable usage of ff_h264_idct_add_altivec since AltiVec versions of h264_idct_add16,
gpoirier
parents:
7376
diff
changeset
|
559 } |
11307ea31e57
Disable usage of ff_h264_idct_add_altivec since AltiVec versions of h264_idct_add16,
gpoirier
parents:
7376
diff
changeset
|
560 } |
11307ea31e57
Disable usage of ff_h264_idct_add_altivec since AltiVec versions of h264_idct_add16,
gpoirier
parents:
7376
diff
changeset
|
561 } |
11307ea31e57
Disable usage of ff_h264_idct_add_altivec since AltiVec versions of h264_idct_add16,
gpoirier
parents:
7376
diff
changeset
|
562 |
8544
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
563 static void ff_h264_idct_add8_altivec(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
564 int i; |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
565 for(i=16; i<16+8; i++){ |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
566 if(nnzc[ scan8[i] ]) |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
567 ff_h264_idct_add_altivec(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
568 else if(block[i*16]) |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
569 h264_idct_dc_add_altivec(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
570 } |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
571 } |
0ae8629baf6f
Add AltiVec versions of h264_idct_add(8|16|16intra),
gpoirier
parents:
8541
diff
changeset
|
572 |
/*
 * In-place byte interleave of four vec_u8 rows using two rounds of
 * vec_mergeh/vec_mergel. r0..r3 are both inputs and outputs and must be
 * plain variables; each is expanded more than once.
 */
#define transpose4x16(r0, r1, r2, r3) {                                   \
    /* round 1: rows 0 and 2 interleaved, rows 1 and 3 interleaved */     \
    register vec_u8 t4_02_hi = vec_mergeh(r0, r2);                        \
    register vec_u8 t4_02_lo = vec_mergel(r0, r2);                        \
    register vec_u8 t4_13_hi = vec_mergeh(r1, r3);                        \
    register vec_u8 t4_13_lo = vec_mergel(r1, r3);                        \
                                                                          \
    /* round 2: bytes of all four rows side by side */                    \
    r0 = vec_mergeh(t4_02_hi, t4_13_hi);   /* all, set 0 */               \
    r1 = vec_mergel(t4_02_hi, t4_13_hi);   /* all, set 1 */               \
    r2 = vec_mergeh(t4_02_lo, t4_13_lo);   /* all, set 2 */               \
    r3 = vec_mergel(t4_02_lo, t4_13_lo);   /* all, set 3 */               \
}
589 | |
5135 | 590 static inline void write16x4(uint8_t *dst, int dst_stride, |
8494 | 591 register vec_u8 r0, register vec_u8 r1, |
592 register vec_u8 r2, register vec_u8 r3) { | |
11369 | 593 DECLARE_ALIGNED(16, unsigned char, result)[64]; |
5119 | 594 uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; |
595 int int_dst_stride = dst_stride/4; | |
596 | |
597 vec_st(r0, 0, result); | |
598 vec_st(r1, 16, result); | |
599 vec_st(r2, 32, result); | |
600 vec_st(r3, 48, result); | |
601 /* FIXME: there has to be a better way!!!! */ | |
602 *dst_int = *src_int; | |
603 *(dst_int+ int_dst_stride) = *(src_int + 1); | |
604 *(dst_int+ 2*int_dst_stride) = *(src_int + 2); | |
605 *(dst_int+ 3*int_dst_stride) = *(src_int + 3); | |
606 *(dst_int+ 4*int_dst_stride) = *(src_int + 4); | |
607 *(dst_int+ 5*int_dst_stride) = *(src_int + 5); | |
608 *(dst_int+ 6*int_dst_stride) = *(src_int + 6); | |
609 *(dst_int+ 7*int_dst_stride) = *(src_int + 7); | |
610 *(dst_int+ 8*int_dst_stride) = *(src_int + 8); | |
611 *(dst_int+ 9*int_dst_stride) = *(src_int + 9); | |
612 *(dst_int+10*int_dst_stride) = *(src_int + 10); | |
613 *(dst_int+11*int_dst_stride) = *(src_int + 11); | |
614 *(dst_int+12*int_dst_stride) = *(src_int + 12); | |
615 *(dst_int+13*int_dst_stride) = *(src_int + 13); | |
616 *(dst_int+14*int_dst_stride) = *(src_int + 14); | |
617 *(dst_int+15*int_dst_stride) = *(src_int + 15); | |
618 } | |
619 | |
/** \brief Loads 16 rows starting at src and transposes their leading columns.
 *
 * Row k is fetched with unaligned_load(k*src_stride, src). Four rounds of
 * byte merges interleave the rows; the six results ("set 0" .. "set 5",
 * each combining bytes from all 16 rows) are assigned to r8 .. r13, which
 * must be vec_u8 variables. Sets 6 and 7 are never computed.
 *
 * \todo FIXME: check whether some vec_lvsl() calls can be saved by
 *       factoring them out of unaligned_load()
 */
#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {         \
    /* the 16 source rows */                                                        \
    register vec_u8 rt_row0  = unaligned_load(0,                 (src));            \
    register vec_u8 rt_row1  = unaligned_load(   (src_stride),   (src));            \
    register vec_u8 rt_row2  = unaligned_load( 2*(src_stride),   (src));            \
    register vec_u8 rt_row3  = unaligned_load( 3*(src_stride),   (src));            \
    register vec_u8 rt_row4  = unaligned_load( 4*(src_stride),   (src));            \
    register vec_u8 rt_row5  = unaligned_load( 5*(src_stride),   (src));            \
    register vec_u8 rt_row6  = unaligned_load( 6*(src_stride),   (src));            \
    register vec_u8 rt_row7  = unaligned_load( 7*(src_stride),   (src));            \
    register vec_u8 rt_row8  = unaligned_load( 8*(src_stride),   (src));            \
    register vec_u8 rt_row9  = unaligned_load( 9*(src_stride),   (src));            \
    register vec_u8 rt_row10 = unaligned_load(10*(src_stride),   (src));            \
    register vec_u8 rt_row11 = unaligned_load(11*(src_stride),   (src));            \
    register vec_u8 rt_row12 = unaligned_load(12*(src_stride),   (src));            \
    register vec_u8 rt_row13 = unaligned_load(13*(src_stride),   (src));            \
    register vec_u8 rt_row14 = unaligned_load(14*(src_stride),   (src));            \
    register vec_u8 rt_row15 = unaligned_load(15*(src_stride),   (src));            \
                                                                                    \
    /* round 1: pair row k with row k+8 */                                          \
    register vec_u8 rt_a0 = vec_mergeh(rt_row0, rt_row8);    /*0, 8*/               \
    register vec_u8 rt_a1 = vec_mergeh(rt_row1, rt_row9);    /*1, 9*/               \
    register vec_u8 rt_a2 = vec_mergeh(rt_row2, rt_row10);   /*2,10*/               \
    register vec_u8 rt_a3 = vec_mergeh(rt_row3, rt_row11);   /*3,11*/               \
    register vec_u8 rt_a4 = vec_mergeh(rt_row4, rt_row12);   /*4,12*/               \
    register vec_u8 rt_a5 = vec_mergeh(rt_row5, rt_row13);   /*5,13*/               \
    register vec_u8 rt_a6 = vec_mergeh(rt_row6, rt_row14);   /*6,14*/               \
    register vec_u8 rt_a7 = vec_mergeh(rt_row7, rt_row15);   /*7,15*/               \
                                                                                    \
    /* round 2: pair the results that are four rows apart */                        \
    register vec_u8 rt_b0 = vec_mergeh(rt_a0, rt_a4);   /*0,4, 8,12 set 0*/         \
    register vec_u8 rt_b1 = vec_mergel(rt_a0, rt_a4);   /*0,4, 8,12 set 1*/         \
    register vec_u8 rt_b2 = vec_mergeh(rt_a1, rt_a5);   /*1,5, 9,13 set 0*/         \
    register vec_u8 rt_b3 = vec_mergel(rt_a1, rt_a5);   /*1,5, 9,13 set 1*/         \
    register vec_u8 rt_b4 = vec_mergeh(rt_a2, rt_a6);   /*2,6,10,14 set 0*/         \
    register vec_u8 rt_b5 = vec_mergel(rt_a2, rt_a6);   /*2,6,10,14 set 1*/         \
    register vec_u8 rt_b6 = vec_mergeh(rt_a3, rt_a7);   /*3,7,11,15 set 0*/         \
    register vec_u8 rt_b7 = vec_mergel(rt_a3, rt_a7);   /*3,7,11,15 set 1*/         \
                                                                                    \
    /* round 3: even rows together, odd rows together; the fourth */                \
    /* merge of each group is not needed and therefore skipped    */                \
    register vec_u8 rt_e0 = vec_mergeh(rt_b0, rt_b4);   /*even rows set 0*/         \
    register vec_u8 rt_e1 = vec_mergel(rt_b0, rt_b4);   /*even rows set 1*/         \
    register vec_u8 rt_e2 = vec_mergeh(rt_b1, rt_b5);   /*even rows set 2*/         \
    register vec_u8 rt_o0 = vec_mergeh(rt_b2, rt_b6);   /*odd rows set 0*/          \
    register vec_u8 rt_o1 = vec_mergel(rt_b2, rt_b6);   /*odd rows set 1*/          \
    register vec_u8 rt_o2 = vec_mergeh(rt_b3, rt_b7);   /*odd rows set 2*/          \
                                                                                    \
    /* round 4: interleave even and odd rows into the outputs */                    \
    r8  = vec_mergeh(rt_e0, rt_o0);   /*all set 0*/                                 \
    r9  = vec_mergel(rt_e0, rt_o0);   /*all set 1*/                                 \
    r10 = vec_mergeh(rt_e1, rt_o1);   /*all set 2*/                                 \
    r11 = vec_mergel(rt_e1, rt_o1);   /*all set 3*/                                 \
    r12 = vec_mergeh(rt_e2, rt_o2);   /*all set 4*/                                 \
    r13 = vec_mergel(rt_e2, rt_o2);   /*all set 5*/                                 \
}
681 | |
682 // out: o = |x-y| < a | |
8494 | 683 static inline vec_u8 diff_lt_altivec ( register vec_u8 x, |
684 register vec_u8 y, | |
685 register vec_u8 a) { | |
5119 | 686 |
8494 | 687 register vec_u8 diff = vec_subs(x, y); |
688 register vec_u8 diffneg = vec_subs(y, x); | |
689 register vec_u8 o = vec_or(diff, diffneg); /* |x-y| */ | |
690 o = (vec_u8)vec_cmplt(o, a); | |
5119 | 691 return o; |
692 } | |
693 | |
8494 | 694 static inline vec_u8 h264_deblock_mask ( register vec_u8 p0, |
695 register vec_u8 p1, | |
696 register vec_u8 q0, | |
697 register vec_u8 q1, | |
698 register vec_u8 alpha, | |
699 register vec_u8 beta) { | |
5119 | 700 |
8494 | 701 register vec_u8 mask; |
702 register vec_u8 tempmask; | |
5119 | 703 |
704 mask = diff_lt_altivec(p0, q0, alpha); | |
705 tempmask = diff_lt_altivec(p1, p0, beta); | |
706 mask = vec_and(mask, tempmask); | |
707 tempmask = diff_lt_altivec(q1, q0, beta); | |
708 mask = vec_and(mask, tempmask); | |
709 | |
710 return mask; | |
711 } | |
712 | |
5165
c99fa49eaa80
part 2/2 of fixing Altivec-accelerated H264 luma inloop filter
gpoirier
parents:
5164
diff
changeset
|
713 // out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) |
8494 | 714 static inline vec_u8 h264_deblock_q1(register vec_u8 p0, |
715 register vec_u8 p1, | |
716 register vec_u8 p2, | |
717 register vec_u8 q0, | |
718 register vec_u8 tc0) { | |
5164 | 719 |
8494 | 720 register vec_u8 average = vec_avg(p0, q0); |
721 register vec_u8 temp; | |
722 register vec_u8 uncliped; | |
723 register vec_u8 ones; | |
724 register vec_u8 max; | |
725 register vec_u8 min; | |
726 register vec_u8 newp1; | |
5164 | 727 |
728 temp = vec_xor(average, p2); | |
729 average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ | |
730 ones = vec_splat_u8(1); | |
731 temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ | |
732 uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */ | |
733 max = vec_adds(p1, tc0); | |
734 min = vec_subs(p1, tc0); | |
5165
c99fa49eaa80
part 2/2 of fixing Altivec-accelerated H264 luma inloop filter
gpoirier
parents:
5164
diff
changeset
|
735 newp1 = vec_max(min, uncliped); |
c99fa49eaa80
part 2/2 of fixing Altivec-accelerated H264 luma inloop filter
gpoirier
parents:
5164
diff
changeset
|
736 newp1 = vec_min(max, newp1); |
c99fa49eaa80
part 2/2 of fixing Altivec-accelerated H264 luma inloop filter
gpoirier
parents:
5164
diff
changeset
|
737 return newp1; |
5119 | 738 } |
739 | |
/*
 * Update p0 and q0 in place with the filter delta, limited to tc0masked.
 *
 * The delta is computed in unsigned bytes around a bias of 160 (10 << 4).
 * Its positive part (dq_pos) and its negated part (dq_neg) are derived
 * with saturating subtractions against that bias, each limited to
 * tc0masked, and then applied with saturating add/sub.
 * p0 and q0 must be modifiable vec_u8 variables.
 */
#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) {                                          \
    const vec_u8 dq_bias_src = vec_sl(vec_splat_u8(10), vec_splat_u8(4));  /* 160 */             \
    const vec_u8 dq_one      = vec_splat_u8(1);                                                  \
                                                                                                 \
    /* (p0 ^ q0) & 1 */                                                                          \
    register vec_u8 dq_lsb   = vec_and(vec_xor(p0, q0), dq_one);                                 \
    /* avg(p1, 255 - q1) >> 1  ==  (p1 - q1 + 256) >> 2 */                                       \
    register vec_u8 dq_p1q1  = vec_sr(vec_avg(p1, vec_nor(q1, q1)), dq_one);                     \
    /* avg(q0, 255 - p0)       ==  (q0 - p0 + 256) >> 1 */                                       \
    register vec_u8 dq_q0p0  = vec_avg(q0, vec_nor(p0, p0));                                     \
    /* biased delta: 160 + d */                                                                  \
    register vec_u8 dq_sum   = vec_adds(vec_avg(dq_p1q1, dq_lsb), dq_q0p0);                      \
    /* the bias is read back through vec_ld from the constant above */                           \
    register vec_u8 dq_bias  = vec_ld(0, &dq_bias_src);                                          \
    /* -d and d, each saturated at 0 and limited to tc0masked */                                 \
    register vec_u8 dq_neg   = vec_min(tc0masked, vec_subs(dq_bias, dq_sum));                    \
    register vec_u8 dq_pos   = vec_min(tc0masked, vec_subs(dq_sum, dq_bias));                    \
                                                                                                 \
    p0 = vec_adds(vec_subs(p0, dq_neg), dq_pos);                                                 \
    q0 = vec_adds(vec_subs(q0, dq_pos), dq_neg);                                                 \
}
771 | |
/*
 * Luma deblocking of one edge, 16 pixels at a time.
 *
 *   p2,p1,p0  vec_u8 variables with the three pixel lines on one side
 *   q0,q1,q2  vec_u8 variables with the three pixel lines on the other side
 *   alpha     edge threshold, stored into one byte
 *   beta      inner threshold, stored into one byte
 *   tc0       pointer to four int8_t values; each one is replicated to four
 *             consecutive lanes, lanes with a negative value are not filtered
 *
 * p1, p0, q0 and q1 are overwritten with the filtered values.
 *
 * The four tc0 bytes are copied one by one into the aligned scratch
 * buffer: tc0 is an int8_t array with no alignment guarantee, so reading
 * it through an int pointer would violate the C aliasing and alignment
 * rules and would also assume sizeof(int) == 4.
 */
#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) {            \
    DECLARE_ALIGNED(16, unsigned char, temp)[16];                                             \
    register vec_u8 alphavec;                                                                 \
    register vec_u8 betavec;                                                                  \
    register vec_u8 mask;                                                                     \
    register vec_u8 p1mask;                                                                   \
    register vec_u8 q1mask;                                                                   \
    register vector signed char tc0vec;                                                       \
    register vec_u8 finaltc0;                                                                 \
    register vec_u8 tc0masked;                                                                \
    register vec_u8 newp1;                                                                    \
    register vec_u8 newq1;                                                                    \
                                                                                              \
    /* broadcast alpha and beta to all lanes */                                               \
    temp[0] = alpha;                                                                          \
    temp[1] = beta;                                                                           \
    alphavec = vec_ld(0, temp);                                                               \
    betavec = vec_splat(alphavec, 0x1);                                                       \
    alphavec = vec_splat(alphavec, 0x0);                                                      \
    mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */             \
                                                                                              \
    /* bytewise copy of the four tc0 values, then replicate each one x4 */                    \
    temp[0] = (unsigned char)(tc0)[0];                                                        \
    temp[1] = (unsigned char)(tc0)[1];                                                        \
    temp[2] = (unsigned char)(tc0)[2];                                                        \
    temp[3] = (unsigned char)(tc0)[3];                                                        \
    tc0vec = vec_ld(0, (signed char*)temp);                                                   \
    tc0vec = vec_mergeh(tc0vec, tc0vec);                                                      \
    tc0vec = vec_mergeh(tc0vec, tc0vec);                                                      \
    mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1)));  /* if tc0[i] >= 0 */          \
    finaltc0 = vec_and((vec_u8)tc0vec, mask);     /* tc = tc0 */                              \
                                                                                              \
    p1mask = diff_lt_altivec(p2, p0, betavec);                                                \
    p1mask = vec_and(p1mask, mask);                             /* if ( |p2 - p0| < beta) */  \
    tc0masked = vec_and(p1mask, (vec_u8)tc0vec);                                              \
    /* the mask is all-ones (-1) where set, so subtracting it adds 1 */                       \
    finaltc0 = vec_sub(finaltc0, p1mask);                       /* tc++ */                    \
    newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked);                                       \
    /*end if*/                                                                                \
                                                                                              \
    q1mask = diff_lt_altivec(q2, q0, betavec);                                                \
    q1mask = vec_and(q1mask, mask);                             /* if ( |q2 - q0| < beta ) */ \
    tc0masked = vec_and(q1mask, (vec_u8)tc0vec);                                              \
    finaltc0 = vec_sub(finaltc0, q1mask);                       /* tc++ */                    \
    newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked);                                       \
    /*end if*/                                                                                \
                                                                                              \
    /* p0/q0 are filtered from the old p1/q1, which are replaced afterwards */                \
    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0);                                             \
    p1 = newp1;                                                                               \
    q1 = newq1;                                                                               \
}
817 | |
818 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { | |
819 | |
5586 | 820 if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { |
8494 | 821 register vec_u8 p2 = vec_ld(-3*stride, pix); |
822 register vec_u8 p1 = vec_ld(-2*stride, pix); | |
823 register vec_u8 p0 = vec_ld(-1*stride, pix); | |
824 register vec_u8 q0 = vec_ld(0, pix); | |
825 register vec_u8 q1 = vec_ld(stride, pix); | |
826 register vec_u8 q2 = vec_ld(2*stride, pix); | |
5119 | 827 h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); |
828 vec_st(p1, -2*stride, pix); | |
829 vec_st(p0, -1*stride, pix); | |
830 vec_st(q0, 0, pix); | |
831 vec_st(q1, stride, pix); | |
832 } | |
833 } | |
834 | |
835 static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { | |
836 | |
8494 | 837 register vec_u8 line0, line1, line2, line3, line4, line5; |
5586 | 838 if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) |
5119 | 839 return; |
840 readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5); | |
841 h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0); | |
842 transpose4x16(line1, line2, line3, line4); | |
843 write16x4(pix-2, stride, line1, line2, line3, line4); | |
844 } | |
845 | |
8541
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
846 static av_always_inline |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
847 void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h) |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
848 { |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
849 int y, aligned; |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
850 vec_u8 vblock; |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
851 vec_s16 vtemp, vweight, voffset, v0, v1; |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
852 vec_u16 vlog2_denom; |
11369 | 853 DECLARE_ALIGNED(16, int32_t, temp)[4]; |
8541
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
854 LOAD_ZERO; |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
855 |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
856 offset <<= log2_denom; |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
857 if(log2_denom) offset += 1<<(log2_denom-1); |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
858 temp[0] = log2_denom; |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
859 temp[1] = weight; |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
860 temp[2] = offset; |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
861 |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
862 vtemp = (vec_s16)vec_ld(0, temp); |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
863 vlog2_denom = (vec_u16)vec_splat(vtemp, 1); |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
864 vweight = vec_splat(vtemp, 3); |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
865 voffset = vec_splat(vtemp, 5); |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
866 aligned = !((unsigned long)block & 0xf); |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
867 |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
868 for (y=0; y<h; y++) { |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
869 vblock = vec_ld(0, block); |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
870 |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
871 v0 = (vec_s16)vec_mergeh(zero_u8v, vblock); |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
872 v1 = (vec_s16)vec_mergel(zero_u8v, vblock); |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
873 |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
874 if (w == 16 || aligned) { |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
875 v0 = vec_mladd(v0, vweight, zero_s16v); |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
876 v0 = vec_adds(v0, voffset); |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
877 v0 = vec_sra(v0, vlog2_denom); |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
878 } |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
879 if (w == 16 || !aligned) { |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
880 v1 = vec_mladd(v1, vweight, zero_s16v); |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
881 v1 = vec_adds(v1, voffset); |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
882 v1 = vec_sra(v1, vlog2_denom); |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
883 } |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
884 vblock = vec_packsu(v0, v1); |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
885 vec_st(vblock, 0, block); |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
886 |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
887 block += stride; |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
888 } |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
889 } |
8531
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
890 |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
891 static av_always_inline |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
892 void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
893 int weightd, int weights, int offset, int w, int h) |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
894 { |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
895 int y, dst_aligned, src_aligned; |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
896 vec_u8 vsrc, vdst; |
8535
8f3e20061aff
offset and weights are signed, fixes some non-bitexact issues.
gpoirier
parents:
8531
diff
changeset
|
897 vec_s16 vtemp, vweights, vweightd, voffset, v0, v1, v2, v3; |
8f3e20061aff
offset and weights are signed, fixes some non-bitexact issues.
gpoirier
parents:
8531
diff
changeset
|
898 vec_u16 vlog2_denom; |
11369 | 899 DECLARE_ALIGNED(16, int32_t, temp)[4]; |
8531
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
900 LOAD_ZERO; |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
901 |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
902 offset = ((offset + 1) | 1) << log2_denom; |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
903 temp[0] = log2_denom+1; |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
904 temp[1] = weights; |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
905 temp[2] = weightd; |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
906 temp[3] = offset; |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
907 |
8536 | 908 vtemp = (vec_s16)vec_ld(0, temp); |
909 vlog2_denom = (vec_u16)vec_splat(vtemp, 1); | |
8531
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
910 vweights = vec_splat(vtemp, 3); |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
911 vweightd = vec_splat(vtemp, 5); |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
912 voffset = vec_splat(vtemp, 7); |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
913 dst_aligned = !((unsigned long)dst & 0xf); |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
914 src_aligned = !((unsigned long)src & 0xf); |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
915 |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
916 for (y=0; y<h; y++) { |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
917 vdst = vec_ld(0, dst); |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
918 vsrc = vec_ld(0, src); |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
919 |
8536 | 920 v0 = (vec_s16)vec_mergeh(zero_u8v, vdst); |
921 v1 = (vec_s16)vec_mergel(zero_u8v, vdst); | |
922 v2 = (vec_s16)vec_mergeh(zero_u8v, vsrc); | |
923 v3 = (vec_s16)vec_mergel(zero_u8v, vsrc); | |
8531
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
924 |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
925 if (w == 8) { |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
926 if (src_aligned) |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
927 v3 = v2; |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
928 else |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
929 v2 = v3; |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
930 } |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
931 |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
932 if (w == 16 || dst_aligned) { |
8535
8f3e20061aff
offset and weights are signed, fixes some non-bitexact issues.
gpoirier
parents:
8531
diff
changeset
|
933 v0 = vec_mladd(v0, vweightd, zero_s16v); |
8f3e20061aff
offset and weights are signed, fixes some non-bitexact issues.
gpoirier
parents:
8531
diff
changeset
|
934 v2 = vec_mladd(v2, vweights, zero_s16v); |
8531
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
935 |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
936 v0 = vec_adds(v0, voffset); |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
937 v0 = vec_adds(v0, v2); |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
938 v0 = vec_sra(v0, vlog2_denom); |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
939 } |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
940 if (w == 16 || !dst_aligned) { |
8535
8f3e20061aff
offset and weights are signed, fixes some non-bitexact issues.
gpoirier
parents:
8531
diff
changeset
|
941 v1 = vec_mladd(v1, vweightd, zero_s16v); |
8f3e20061aff
offset and weights are signed, fixes some non-bitexact issues.
gpoirier
parents:
8531
diff
changeset
|
942 v3 = vec_mladd(v3, vweights, zero_s16v); |
8531
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
943 |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
944 v1 = vec_adds(v1, voffset); |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
945 v1 = vec_adds(v1, v3); |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
946 v1 = vec_sra(v1, vlog2_denom); |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
947 } |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
948 vdst = vec_packsu(v0, v1); |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
949 vec_st(vdst, 0, dst); |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
950 |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
951 dst += stride; |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
952 src += stride; |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
953 } |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
954 } |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
955 |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
/*
 * H264_WEIGHT(W,H) defines the two fixed-size entry points
 *   ff_weight_h264_pixels<W>x<H>_altivec
 *   ff_biweight_h264_pixels<W>x<H>_altivec
 * as thin wrappers that pass W and H as constants to the always-inline
 * generic implementations.
 */
#define H264_WEIGHT(W,H) \
static void ff_weight_h264_pixels ## W ## x ## H ## _altivec( \
        uint8_t *block, int stride, int log2_denom, int weight, int offset) \
{ \
    weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \
} \
static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec( \
        uint8_t *dst, uint8_t *src, int stride, int log2_denom, \
        int weightd, int weights, int offset) \
{ \
    biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
}

/* Instantiate the wrappers for the five supported block sizes. */
H264_WEIGHT(16,16)
H264_WEIGHT(16, 8)
H264_WEIGHT( 8,16)
H264_WEIGHT( 8, 8)
H264_WEIGHT( 8, 4)
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
969 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
970 void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) { |
2967 | 971 |
5586 | 972 if (has_altivec()) { |
973 c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec; | |
974 c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec; | |
9439
ef3a7b711cc0
Rename put_no_rnd_h264_chroma* to reflect its usage in VC1 only
conrad
parents:
9421
diff
changeset
|
975 c->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_altivec; |
9444 | 976 c->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_altivec; |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
977 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
978 #define dspfunc(PFX, IDX, NUM) \ |
5586 | 979 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \ |
980 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \ | |
981 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \ | |
982 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \ | |
983 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \ | |
984 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \ | |
985 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \ | |
986 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \ | |
987 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \ | |
988 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \ | |
989 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \ | |
990 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \ | |
991 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \ | |
992 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \ | |
993 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \ | |
994 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec | |
2967 | 995 |
5586 | 996 dspfunc(put_h264_qpel, 0, 16); |
997 dspfunc(avg_h264_qpel, 0, 16); | |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
998 #undef dspfunc |
11499 | 999 } |
1000 } | |
1001 | |
1002 void ff_h264dsp_init_ppc(H264DSPContext *c) | |
1003 { | |
1004 if (has_altivec()) { | |
1005 c->h264_idct_add = ff_h264_idct_add_altivec; | |
1006 c->h264_idct_add8 = ff_h264_idct_add8_altivec; | |
1007 c->h264_idct_add16 = ff_h264_idct_add16_altivec; | |
1008 c->h264_idct_add16intra = ff_h264_idct_add16intra_altivec; | |
1009 c->h264_idct_dc_add= h264_idct_dc_add_altivec; | |
1010 c->h264_idct8_dc_add = ff_h264_idct8_dc_add_altivec; | |
1011 c->h264_idct8_add = ff_h264_idct8_add_altivec; | |
1012 c->h264_idct8_add4 = ff_h264_idct8_add4_altivec; | |
1013 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec; | |
1014 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec; | |
8531
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
1015 |
8541
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
1016 c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec; |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
1017 c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec; |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
1018 c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec; |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
1019 c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec; |
16a315fdad0b
add AltiVec implementation of weight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8536
diff
changeset
|
1020 c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec; |
8531
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
1021 c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec; |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
1022 c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec; |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
1023 c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec; |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
1024 c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec; |
961e40a13102
add AltiVec implementation of biweight_h264_pixels(16|8)x(16|8|4)
gpoirier
parents:
8530
diff
changeset
|
1025 c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec; |
5586 | 1026 } |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
1027 } |