annotate ppc/h264_template_altivec.c @ 7056:0c65c19e5aaa libavcodec

Do not inline g726_iterate() the function is big so its inlining will not help speedwise IMHO. .o size changes from 70k -> 49k
author michael
date Tue, 17 Jun 2008 00:09:42 +0000
parents 292269939c50
children a8a79f5385f6
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
1 /*
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
2 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
3 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3577
diff changeset
4 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3577
diff changeset
5 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3577
diff changeset
6 * FFmpeg is free software; you can redistribute it and/or
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
7 * modify it under the terms of the GNU Lesser General Public
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
8 * License as published by the Free Software Foundation; either
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3577
diff changeset
9 * version 2.1 of the License, or (at your option) any later version.
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
10 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3577
diff changeset
11 * FFmpeg is distributed in the hope that it will be useful,
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
14 * Lesser General Public License for more details.
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
15 *
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
16 * You should have received a copy of the GNU Lesser General Public
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3577
diff changeset
17 * License along with FFmpeg; if not, write to the Free Software
3036
0b546eab515d Update licensing information: The FSF changed postal address.
diego
parents: 2967
diff changeset
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
19 */
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
20
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
21 //#define DEBUG_ALIGNMENT
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
22 #ifdef DEBUG_ALIGNMENT
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
23 #define ASSERT_ALIGNED(ptr) assert(((unsigned long)ptr&0x0000000F));
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
24 #else
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
25 #define ASSERT_ALIGNED(ptr) ;
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
26 #endif
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
27
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
28 /* this code assume that stride % 16 == 0 */
6059
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
29
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
30 #define CHROMA_MC8_ALTIVEC_CORE \
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
31 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
32 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
33 \
6061
95ac40977cf7 Avoid a vec_add, directly start with sum
lu_zero
parents: 6060
diff changeset
34 psum = vec_mladd(vA, vsrc0ssH, v32ss);\
6059
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
35 psum = vec_mladd(vB, vsrc1ssH, psum);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
36 psum = vec_mladd(vC, vsrc2ssH, psum);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
37 psum = vec_mladd(vD, vsrc3ssH, psum);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
38 psum = vec_sr(psum, v6us);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
39 \
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
40 vdst = vec_ld(0, dst);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
41 ppsum = (vec_u8_t)vec_pack(psum, psum);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
42 vfdst = vec_perm(vdst, ppsum, fperm);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
43 \
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
44 OP_U8_ALTIVEC(fsum, vfdst, vdst);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
45 \
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
46 vec_st(fsum, 0, dst);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
47 \
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
48 vsrc0ssH = vsrc2ssH;\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
49 vsrc1ssH = vsrc3ssH;\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
50 \
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
51 dst += stride;\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
52 src += stride;
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
53
6062
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
54 #define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
6064
62d040333d51 Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents: 6063
diff changeset
55 \
62d040333d51 Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents: 6063
diff changeset
56 vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);\
62d040333d51 Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents: 6063
diff changeset
57 vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);\
6062
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
58 \
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
59 psum = vec_mladd(vA, vsrc0ssH, v32ss);\
6064
62d040333d51 Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents: 6063
diff changeset
60 psum = vec_mladd(vE, vsrc1ssH, psum);\
6062
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
61 psum = vec_sr(psum, v6us);\
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
62 \
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
63 vdst = vec_ld(0, dst);\
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
64 ppsum = (vec_u8_t)vec_pack(psum, psum);\
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
65 vfdst = vec_perm(vdst, ppsum, fperm);\
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
66 \
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
67 OP_U8_ALTIVEC(fsum, vfdst, vdst);\
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
68 \
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
69 vec_st(fsum, 0, dst);\
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
70 \
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
71 dst += stride;\
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
72 src += stride;
6059
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
73
6063
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
74 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
75 int stride, int h, int x, int y) {
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
76 POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
5019
41cabe79ba25 use macro Use DECLARE_ALIGNED_16 to align stack-allocated variables
gpoirier
parents: 3947
diff changeset
77 DECLARE_ALIGNED_16(signed int, ABCD[4]) =
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
78 {((8 - x) * (8 - y)),
6058
93089aed00cb Cosmetics
lu_zero
parents: 5603
diff changeset
79 (( x) * (8 - y)),
93089aed00cb Cosmetics
lu_zero
parents: 5603
diff changeset
80 ((8 - x) * ( y)),
93089aed00cb Cosmetics
lu_zero
parents: 5603
diff changeset
81 (( x) * ( y))};
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
82 register int i;
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
83 vec_u8_t fperm;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
84 const vec_s32_t vABCD = vec_ld(0, ABCD);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
85 const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
86 const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
87 const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
88 const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
89 LOAD_ZERO;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
90 const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
91 const vec_u16_t v6us = vec_splat_u16(6);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
92 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
93 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
94
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
95 vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
96 vec_u8_t vsrc0uc, vsrc1uc;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
97 vec_s16_t vsrc0ssH, vsrc1ssH;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
98 vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
99 vec_s16_t vsrc2ssH, vsrc3ssH, psum;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
100 vec_u8_t vdst, ppsum, vfdst, fsum;
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
101
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
102 POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
103
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
104 if (((unsigned long)dst) % 16 == 0) {
6060
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
105 fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
106 0x14, 0x15, 0x16, 0x17,
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
107 0x08, 0x09, 0x0A, 0x0B,
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
108 0x0C, 0x0D, 0x0E, 0x0F);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
109 } else {
6060
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
110 fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
111 0x04, 0x05, 0x06, 0x07,
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
112 0x18, 0x19, 0x1A, 0x1B,
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
113 0x1C, 0x1D, 0x1E, 0x1F);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
114 }
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
115
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
116 vsrcAuc = vec_ld(0, src);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
117
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
118 if (loadSecond)
6060
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
119 vsrcBuc = vec_ld(16, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
120 vsrcperm0 = vec_lvsl(0, src);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
121 vsrcperm1 = vec_lvsl(1, src);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
122
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
123 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
124 if (reallyBadAlign)
6060
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
125 vsrc1uc = vsrcBuc;
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
126 else
6060
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
127 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
128
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
129 vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
130 vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
131
6062
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
132 if (ABCD[3]) {
6063
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
133 if (!loadSecond) {// -> !reallyBadAlign
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
134 for (i = 0 ; i < h ; i++) {
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
135 vsrcCuc = vec_ld(stride + 0, src);
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
136 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
137 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
138
6063
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
139 CHROMA_MC8_ALTIVEC_CORE
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
140 }
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
141 } else {
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
142 vec_u8_t vsrcDuc;
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
143 for (i = 0 ; i < h ; i++) {
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
144 vsrcCuc = vec_ld(stride + 0, src);
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
145 vsrcDuc = vec_ld(stride + 16, src);
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
146 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
147 if (reallyBadAlign)
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
148 vsrc3uc = vsrcDuc;
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
149 else
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
150 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
151
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
152 CHROMA_MC8_ALTIVEC_CORE
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
153 }
6060
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
154 }
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
155 } else {
6064
62d040333d51 Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents: 6063
diff changeset
156 const vec_s16_t vE = vec_add(vB, vC);
6065
180976fd652e 10l do not load after the buffer...
lu_zero
parents: 6064
diff changeset
157 if (ABCD[2]) { // x == 0 B == 0
6067
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
158 if (!loadSecond) {// -> !reallyBadAlign
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
159 for (i = 0 ; i < h ; i++) {
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
160 vsrcCuc = vec_ld(stride + 0, src);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
161 vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
162 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
163
6067
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
164 vsrc0uc = vsrc1uc;
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
165 }
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
166 } else {
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
167 vec_u8_t vsrcDuc;
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
168 for (i = 0 ; i < h ; i++) {
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
169 vsrcCuc = vec_ld(stride + 0, src);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
170 vsrcDuc = vec_ld(stride + 15, src);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
171 vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
172 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
173
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
174 vsrc0uc = vsrc1uc;
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
175 }
6063
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
176 }
6065
180976fd652e 10l do not load after the buffer...
lu_zero
parents: 6064
diff changeset
177 } else { // y == 0 C == 0
6067
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
178 if (!loadSecond) {// -> !reallyBadAlign
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
179 for (i = 0 ; i < h ; i++) {
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
180 vsrcCuc = vec_ld(0, src);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
181 vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
182 vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
6065
180976fd652e 10l do not load after the buffer...
lu_zero
parents: 6064
diff changeset
183
6067
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
184 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
185 }
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
186 } else {
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
187 vec_u8_t vsrcDuc;
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
188 for (i = 0 ; i < h ; i++) {
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
189 vsrcCuc = vec_ld(0, src);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
190 vsrcDuc = vec_ld(15, src);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
191 vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
192 if (reallyBadAlign)
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
193 vsrc1uc = vsrcDuc;
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
194 else
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
195 vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
196
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
197 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
198 }
6064
62d040333d51 Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents: 6063
diff changeset
199 }
62d040333d51 Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents: 6063
diff changeset
200 }
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
201 }
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
202 POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
203 }
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
204
6059
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
205 #undef CHROMA_MC8_ALTIVEC_CORE
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
206
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
207 /* this code assume stride % 16 == 0 */
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
208 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
209 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
210 register int i;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
211
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
212 LOAD_ZERO;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
213 const vec_u8_t permM2 = vec_lvsl(-2, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
214 const vec_u8_t permM1 = vec_lvsl(-1, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
215 const vec_u8_t permP0 = vec_lvsl(+0, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
216 const vec_u8_t permP1 = vec_lvsl(+1, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
217 const vec_u8_t permP2 = vec_lvsl(+2, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
218 const vec_u8_t permP3 = vec_lvsl(+3, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
219 const vec_s16_t v5ss = vec_splat_s16(5);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
220 const vec_u16_t v5us = vec_splat_u16(5);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
221 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
222 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
223
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
224 vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
225
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
226 register int align = ((((unsigned long)src) - 2) % 16);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
227
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
228 vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
229 srcP2A, srcP2B, srcP3A, srcP3B,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
230 srcM1A, srcM1B, srcM2A, srcM2B,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
231 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
232 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
233 psumA, psumB, sumA, sumB;
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
234
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
235 vec_u8_t sum, vdst, fsum;
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
236
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
237 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
238
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
239 for (i = 0 ; i < 16 ; i ++) {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
240 vec_u8_t srcR1 = vec_ld(-2, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
241 vec_u8_t srcR2 = vec_ld(14, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
242
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
243 switch (align) {
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
244 default: {
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
245 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
246 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
247 srcP0 = vec_perm(srcR1, srcR2, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
248 srcP1 = vec_perm(srcR1, srcR2, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
249 srcP2 = vec_perm(srcR1, srcR2, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
250 srcP3 = vec_perm(srcR1, srcR2, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
251 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
252 case 11: {
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
253 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
254 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
255 srcP0 = vec_perm(srcR1, srcR2, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
256 srcP1 = vec_perm(srcR1, srcR2, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
257 srcP2 = vec_perm(srcR1, srcR2, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
258 srcP3 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
259 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
260 case 12: {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
261 vec_u8_t srcR3 = vec_ld(30, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
262 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
263 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
264 srcP0 = vec_perm(srcR1, srcR2, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
265 srcP1 = vec_perm(srcR1, srcR2, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
266 srcP2 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
267 srcP3 = vec_perm(srcR2, srcR3, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
268 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
269 case 13: {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
270 vec_u8_t srcR3 = vec_ld(30, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
271 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
272 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
273 srcP0 = vec_perm(srcR1, srcR2, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
274 srcP1 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
275 srcP2 = vec_perm(srcR2, srcR3, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
276 srcP3 = vec_perm(srcR2, srcR3, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
277 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
278 case 14: {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
279 vec_u8_t srcR3 = vec_ld(30, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
280 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
281 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
282 srcP0 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
283 srcP1 = vec_perm(srcR2, srcR3, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
284 srcP2 = vec_perm(srcR2, srcR3, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
285 srcP3 = vec_perm(srcR2, srcR3, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
286 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
287 case 15: {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
288 vec_u8_t srcR3 = vec_ld(30, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
289 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
290 srcM1 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
291 srcP0 = vec_perm(srcR2, srcR3, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
292 srcP1 = vec_perm(srcR2, srcR3, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
293 srcP2 = vec_perm(srcR2, srcR3, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
294 srcP3 = vec_perm(srcR2, srcR3, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
295 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
296 }
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
297
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
298 srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
299 srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
300 srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
301 srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
302
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
303 srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
304 srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
305 srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
306 srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
307
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
308 srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
309 srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
310 srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
311 srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
312
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
313 sum1A = vec_adds(srcP0A, srcP1A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
314 sum1B = vec_adds(srcP0B, srcP1B);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
315 sum2A = vec_adds(srcM1A, srcP2A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
316 sum2B = vec_adds(srcM1B, srcP2B);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
317 sum3A = vec_adds(srcM2A, srcP3A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
318 sum3B = vec_adds(srcM2B, srcP3B);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
319
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
320 pp1A = vec_mladd(sum1A, v20ss, v16ss);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
321 pp1B = vec_mladd(sum1B, v20ss, v16ss);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
322
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
323 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
324 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
325
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
326 pp3A = vec_add(sum3A, pp1A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
327 pp3B = vec_add(sum3B, pp1B);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
328
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
329 psumA = vec_sub(pp3A, pp2A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
330 psumB = vec_sub(pp3B, pp2B);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
331
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
332 sumA = vec_sra(psumA, v5us);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
333 sumB = vec_sra(psumB, v5us);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
334
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
335 sum = vec_packsu(sumA, sumB);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
336
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
337 ASSERT_ALIGNED(dst);
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
338 vdst = vec_ld(0, dst);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
339
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
340 OP_U8_ALTIVEC(fsum, sum, vdst);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
341
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
342 vec_st(fsum, 0, dst);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
343
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
344 src += srcStride;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
345 dst += dstStride;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
346 }
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
347 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
348 }
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
349
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
350 /* this code assume stride % 16 == 0 */
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
351 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
352 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
353
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
354 register int i;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
355
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
356 LOAD_ZERO;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
357 const vec_u8_t perm = vec_lvsl(0, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
358 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
359 const vec_u16_t v5us = vec_splat_u16(5);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
360 const vec_s16_t v5ss = vec_splat_s16(5);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
361 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
362
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
363 uint8_t *srcbis = src - (srcStride * 2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
364
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
365 const vec_u8_t srcM2a = vec_ld(0, srcbis);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
366 const vec_u8_t srcM2b = vec_ld(16, srcbis);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
367 const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
368 // srcbis += srcStride;
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
369 const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
370 const vec_u8_t srcM1b = vec_ld(16, srcbis);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
371 const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
372 // srcbis += srcStride;
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
373 const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
374 const vec_u8_t srcP0b = vec_ld(16, srcbis);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
375 const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
376 // srcbis += srcStride;
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
377 const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
378 const vec_u8_t srcP1b = vec_ld(16, srcbis);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
379 const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
380 // srcbis += srcStride;
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
381 const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
382 const vec_u8_t srcP2b = vec_ld(16, srcbis);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
383 const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
384 // srcbis += srcStride;
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
385
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
386 vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
387 vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
388 vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
389 vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
390 vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
391 vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
392 vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
393 vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
394 vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
395 vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
396
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
397 vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
398 psumA, psumB, sumA, sumB,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
399 srcP3ssA, srcP3ssB,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
400 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
401
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
402 vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3;
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
403
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
404 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
405
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
406 for (i = 0 ; i < 16 ; i++) {
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
407 srcP3a = vec_ld(0, srcbis += srcStride);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
408 srcP3b = vec_ld(16, srcbis);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
409 srcP3 = vec_perm(srcP3a, srcP3b, perm);
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
410 srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
411 srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
412 // srcbis += srcStride;
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
413
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
414 sum1A = vec_adds(srcP0ssA, srcP1ssA);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
415 sum1B = vec_adds(srcP0ssB, srcP1ssB);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
416 sum2A = vec_adds(srcM1ssA, srcP2ssA);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
417 sum2B = vec_adds(srcM1ssB, srcP2ssB);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
418 sum3A = vec_adds(srcM2ssA, srcP3ssA);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
419 sum3B = vec_adds(srcM2ssB, srcP3ssB);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
420
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
421 srcM2ssA = srcM1ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
422 srcM2ssB = srcM1ssB;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
423 srcM1ssA = srcP0ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
424 srcM1ssB = srcP0ssB;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
425 srcP0ssA = srcP1ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
426 srcP0ssB = srcP1ssB;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
427 srcP1ssA = srcP2ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
428 srcP1ssB = srcP2ssB;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
429 srcP2ssA = srcP3ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
430 srcP2ssB = srcP3ssB;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
431
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
432 pp1A = vec_mladd(sum1A, v20ss, v16ss);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
433 pp1B = vec_mladd(sum1B, v20ss, v16ss);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
434
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
435 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
436 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
437
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
438 pp3A = vec_add(sum3A, pp1A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
439 pp3B = vec_add(sum3B, pp1B);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
440
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
441 psumA = vec_sub(pp3A, pp2A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
442 psumB = vec_sub(pp3B, pp2B);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
443
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
444 sumA = vec_sra(psumA, v5us);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
445 sumB = vec_sra(psumB, v5us);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
446
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
447 sum = vec_packsu(sumA, sumB);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
448
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
449 ASSERT_ALIGNED(dst);
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
450 vdst = vec_ld(0, dst);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
451
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
452 OP_U8_ALTIVEC(fsum, sum, vdst);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
453
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
454 vec_st(fsum, 0, dst);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
455
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
456 dst += dstStride;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
457 }
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
458 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
459 }
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
460
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
461 /* this code assume stride % 16 == 0 *and* tmp is properly aligned */
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
462 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
463 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
464 register int i;
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
465 LOAD_ZERO;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
466 const vec_u8_t permM2 = vec_lvsl(-2, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
467 const vec_u8_t permM1 = vec_lvsl(-1, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
468 const vec_u8_t permP0 = vec_lvsl(+0, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
469 const vec_u8_t permP1 = vec_lvsl(+1, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
470 const vec_u8_t permP2 = vec_lvsl(+2, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
471 const vec_u8_t permP3 = vec_lvsl(+3, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
472 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
473 const vec_u32_t v10ui = vec_splat_u32(10);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
474 const vec_s16_t v5ss = vec_splat_s16(5);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
475 const vec_s16_t v1ss = vec_splat_s16(1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
476 const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
477 const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
478
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
479 register int align = ((((unsigned long)src) - 2) % 16);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
480
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
481 vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
482 srcP2A, srcP2B, srcP3A, srcP3B,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
483 srcM1A, srcM1B, srcM2A, srcM2B,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
484 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
485 pp1A, pp1B, pp2A, pp2B, psumA, psumB;
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
486
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
487 const vec_u8_t mperm = (const vec_u8_t)
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
488 AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
489 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
490 int16_t *tmpbis = tmp;
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
491
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
492 vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
493 tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
494 tmpP2ssA, tmpP2ssB;
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
495
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
496 vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
497 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
498 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
499 ssumAe, ssumAo, ssumBe, ssumBo;
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
500 vec_u8_t fsum, sumv, sum, vdst;
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
501 vec_s16_t ssume, ssumo;
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
502
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
503 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
504 src -= (2 * srcStride);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
505 for (i = 0 ; i < 21 ; i ++) {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
506 vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
507 vec_u8_t srcR1 = vec_ld(-2, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
508 vec_u8_t srcR2 = vec_ld(14, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
509
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
510 switch (align) {
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
511 default: {
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
512 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
513 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
514 srcP0 = vec_perm(srcR1, srcR2, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
515 srcP1 = vec_perm(srcR1, srcR2, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
516 srcP2 = vec_perm(srcR1, srcR2, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
517 srcP3 = vec_perm(srcR1, srcR2, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
518 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
519 case 11: {
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
520 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
521 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
522 srcP0 = vec_perm(srcR1, srcR2, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
523 srcP1 = vec_perm(srcR1, srcR2, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
524 srcP2 = vec_perm(srcR1, srcR2, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
525 srcP3 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
526 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
527 case 12: {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
528 vec_u8_t srcR3 = vec_ld(30, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
529 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
530 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
531 srcP0 = vec_perm(srcR1, srcR2, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
532 srcP1 = vec_perm(srcR1, srcR2, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
533 srcP2 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
534 srcP3 = vec_perm(srcR2, srcR3, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
535 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
536 case 13: {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
537 vec_u8_t srcR3 = vec_ld(30, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
538 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
539 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
540 srcP0 = vec_perm(srcR1, srcR2, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
541 srcP1 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
542 srcP2 = vec_perm(srcR2, srcR3, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
543 srcP3 = vec_perm(srcR2, srcR3, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
544 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
545 case 14: {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
546 vec_u8_t srcR3 = vec_ld(30, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
547 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
548 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
549 srcP0 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
550 srcP1 = vec_perm(srcR2, srcR3, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
551 srcP2 = vec_perm(srcR2, srcR3, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
552 srcP3 = vec_perm(srcR2, srcR3, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
553 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
554 case 15: {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
555 vec_u8_t srcR3 = vec_ld(30, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
556 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
557 srcM1 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
558 srcP0 = vec_perm(srcR2, srcR3, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
559 srcP1 = vec_perm(srcR2, srcR3, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
560 srcP2 = vec_perm(srcR2, srcR3, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
561 srcP3 = vec_perm(srcR2, srcR3, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
562 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
563 }
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
564
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
565 srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
566 srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
567 srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
568 srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
569
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
570 srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
571 srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
572 srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
573 srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
574
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
575 srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
576 srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
577 srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
578 srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
579
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
580 sum1A = vec_adds(srcP0A, srcP1A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
581 sum1B = vec_adds(srcP0B, srcP1B);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
582 sum2A = vec_adds(srcM1A, srcP2A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
583 sum2B = vec_adds(srcM1B, srcP2B);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
584 sum3A = vec_adds(srcM2A, srcP3A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
585 sum3B = vec_adds(srcM2B, srcP3B);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
586
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
587 pp1A = vec_mladd(sum1A, v20ss, sum3A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
588 pp1B = vec_mladd(sum1B, v20ss, sum3B);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
589
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
590 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
591 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
592
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
593 psumA = vec_sub(pp1A, pp2A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
594 psumB = vec_sub(pp1B, pp2B);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
595
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
596 vec_st(psumA, 0, tmp);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
597 vec_st(psumB, 16, tmp);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
598
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
599 src += srcStride;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
600 tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
601 }
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
602
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
603 tmpM2ssA = vec_ld(0, tmpbis);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
604 tmpM2ssB = vec_ld(16, tmpbis);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
605 tmpbis += tmpStride;
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
606 tmpM1ssA = vec_ld(0, tmpbis);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
607 tmpM1ssB = vec_ld(16, tmpbis);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
608 tmpbis += tmpStride;
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
609 tmpP0ssA = vec_ld(0, tmpbis);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
610 tmpP0ssB = vec_ld(16, tmpbis);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
611 tmpbis += tmpStride;
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
612 tmpP1ssA = vec_ld(0, tmpbis);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
613 tmpP1ssB = vec_ld(16, tmpbis);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
614 tmpbis += tmpStride;
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
615 tmpP2ssA = vec_ld(0, tmpbis);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
616 tmpP2ssB = vec_ld(16, tmpbis);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
617 tmpbis += tmpStride;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
618
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
619 for (i = 0 ; i < 16 ; i++) {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
620 const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
621 const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
622
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
623 const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
624 const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
625 const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
626 const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
627 const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
628 const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
629
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
630 tmpbis += tmpStride;
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
631
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
632 tmpM2ssA = tmpM1ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
633 tmpM2ssB = tmpM1ssB;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
634 tmpM1ssA = tmpP0ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
635 tmpM1ssB = tmpP0ssB;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
636 tmpP0ssA = tmpP1ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
637 tmpP0ssB = tmpP1ssB;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
638 tmpP1ssA = tmpP2ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
639 tmpP1ssB = tmpP2ssB;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
640 tmpP2ssA = tmpP3ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
641 tmpP2ssB = tmpP3ssB;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
642
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
643 pp1Ae = vec_mule(sum1A, v20ss);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
644 pp1Ao = vec_mulo(sum1A, v20ss);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
645 pp1Be = vec_mule(sum1B, v20ss);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
646 pp1Bo = vec_mulo(sum1B, v20ss);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
647
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
648 pp2Ae = vec_mule(sum2A, v5ss);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
649 pp2Ao = vec_mulo(sum2A, v5ss);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
650 pp2Be = vec_mule(sum2B, v5ss);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
651 pp2Bo = vec_mulo(sum2B, v5ss);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
652
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
653 pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
654 pp3Ao = vec_mulo(sum3A, v1ss);
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
655 pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
656 pp3Bo = vec_mulo(sum3B, v1ss);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
657
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
658 pp1cAe = vec_add(pp1Ae, v512si);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
659 pp1cAo = vec_add(pp1Ao, v512si);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
660 pp1cBe = vec_add(pp1Be, v512si);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
661 pp1cBo = vec_add(pp1Bo, v512si);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
662
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
663 pp32Ae = vec_sub(pp3Ae, pp2Ae);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
664 pp32Ao = vec_sub(pp3Ao, pp2Ao);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
665 pp32Be = vec_sub(pp3Be, pp2Be);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
666 pp32Bo = vec_sub(pp3Bo, pp2Bo);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
667
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
668 sumAe = vec_add(pp1cAe, pp32Ae);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
669 sumAo = vec_add(pp1cAo, pp32Ao);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
670 sumBe = vec_add(pp1cBe, pp32Be);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
671 sumBo = vec_add(pp1cBo, pp32Bo);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
672
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
673 ssumAe = vec_sra(sumAe, v10ui);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
674 ssumAo = vec_sra(sumAo, v10ui);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
675 ssumBe = vec_sra(sumBe, v10ui);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
676 ssumBo = vec_sra(sumBo, v10ui);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
677
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
678 ssume = vec_packs(ssumAe, ssumBe);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
679 ssumo = vec_packs(ssumAo, ssumBo);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
680
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
681 sumv = vec_packsu(ssume, ssumo);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
682 sum = vec_perm(sumv, sumv, mperm);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
683
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
684 ASSERT_ALIGNED(dst);
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
685 vdst = vec_ld(0, dst);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
686
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
687 OP_U8_ALTIVEC(fsum, sum, vdst);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
688
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
689 vec_st(fsum, 0, dst);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
690
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
691 dst += dstStride;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
692 }
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
693 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
694 }