annotate ppc/h264_template_altivec.c @ 6323:e6da66f378c7 libavcodec

mpegvideo.h has two function declarations with the 'inline' specifier but no definition for those functions. The C standard requires a definition to appear in the same translation unit for any function declared with 'inline'. Most of the files including mpegvideo.h do not define those functions. Fix this by removing the 'inline' specifiers from the header. patch by Uoti Urpala
author diego
date Sun, 03 Feb 2008 17:54:30 +0000
parents 292269939c50
children a8a79f5385f6
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
1 /*
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
2 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
3 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3577
diff changeset
4 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3577
diff changeset
5 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3577
diff changeset
6 * FFmpeg is free software; you can redistribute it and/or
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
7 * modify it under the terms of the GNU Lesser General Public
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
8 * License as published by the Free Software Foundation; either
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3577
diff changeset
9 * version 2.1 of the License, or (at your option) any later version.
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
10 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3577
diff changeset
11 * FFmpeg is distributed in the hope that it will be useful,
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
14 * Lesser General Public License for more details.
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
15 *
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
16 * You should have received a copy of the GNU Lesser General Public
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3577
diff changeset
17 * License along with FFmpeg; if not, write to the Free Software
3036
0b546eab515d Update licensing information: The FSF changed postal address.
diego
parents: 2967
diff changeset
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
19 */
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
20
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
21 //#define DEBUG_ALIGNMENT
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
22 #ifdef DEBUG_ALIGNMENT
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
23 #define ASSERT_ALIGNED(ptr) assert(!((unsigned long)(ptr) & 0x0000000F)); /* fires when ptr is not 16-byte aligned */
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
24 #else
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
25 #define ASSERT_ALIGNED(ptr) ;
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
26 #endif
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
27
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
28 /* this code assumes that stride % 16 == 0 */
6059
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
29
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
30 #define CHROMA_MC8_ALTIVEC_CORE \
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
31 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
32 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
33 \
6061
95ac40977cf7 Avoid a vec_add, directly start with sum
lu_zero
parents: 6060
diff changeset
34 psum = vec_mladd(vA, vsrc0ssH, v32ss);\
6059
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
35 psum = vec_mladd(vB, vsrc1ssH, psum);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
36 psum = vec_mladd(vC, vsrc2ssH, psum);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
37 psum = vec_mladd(vD, vsrc3ssH, psum);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
38 psum = vec_sr(psum, v6us);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
39 \
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
40 vdst = vec_ld(0, dst);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
41 ppsum = (vec_u8_t)vec_pack(psum, psum);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
42 vfdst = vec_perm(vdst, ppsum, fperm);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
43 \
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
44 OP_U8_ALTIVEC(fsum, vfdst, vdst);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
45 \
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
46 vec_st(fsum, 0, dst);\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
47 \
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
48 vsrc0ssH = vsrc2ssH;\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
49 vsrc1ssH = vsrc3ssH;\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
50 \
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
51 dst += stride;\
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
52 src += stride;
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
53
6062
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
54 #define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
6064
62d040333d51 Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents: 6063
diff changeset
55 \
62d040333d51 Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents: 6063
diff changeset
56 vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);\
62d040333d51 Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents: 6063
diff changeset
57 vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);\
6062
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
58 \
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
59 psum = vec_mladd(vA, vsrc0ssH, v32ss);\
6064
62d040333d51 Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents: 6063
diff changeset
60 psum = vec_mladd(vE, vsrc1ssH, psum);\
6062
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
61 psum = vec_sr(psum, v6us);\
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
62 \
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
63 vdst = vec_ld(0, dst);\
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
64 ppsum = (vec_u8_t)vec_pack(psum, psum);\
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
65 vfdst = vec_perm(vdst, ppsum, fperm);\
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
66 \
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
67 OP_U8_ALTIVEC(fsum, vfdst, vdst);\
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
68 \
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
69 vec_st(fsum, 0, dst);\
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
70 \
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
71 dst += stride;\
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
72 src += stride;
6059
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
73
6063
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
74 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
75 int stride, int h, int x, int y) {
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
76 POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
5019
41cabe79ba25 use macro Use DECLARE_ALIGNED_16 to align stack-allocated variables
gpoirier
parents: 3947
diff changeset
77 DECLARE_ALIGNED_16(signed int, ABCD[4]) =
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
78 {((8 - x) * (8 - y)),
6058
93089aed00cb Cosmetics
lu_zero
parents: 5603
diff changeset
79 (( x) * (8 - y)),
93089aed00cb Cosmetics
lu_zero
parents: 5603
diff changeset
80 ((8 - x) * ( y)),
93089aed00cb Cosmetics
lu_zero
parents: 5603
diff changeset
81 (( x) * ( y))};
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
82 register int i;
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
83 vec_u8_t fperm;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
84 const vec_s32_t vABCD = vec_ld(0, ABCD);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
85 const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
86 const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
87 const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
88 const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
89 LOAD_ZERO;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
90 const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
91 const vec_u16_t v6us = vec_splat_u16(6);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
92 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
93 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
94
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
95 vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
96 vec_u8_t vsrc0uc, vsrc1uc;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
97 vec_s16_t vsrc0ssH, vsrc1ssH;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
98 vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
99 vec_s16_t vsrc2ssH, vsrc3ssH, psum;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
100 vec_u8_t vdst, ppsum, vfdst, fsum;
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
101
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
102 POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
103
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
104 if (((unsigned long)dst) % 16 == 0) {
6060
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
105 fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
106 0x14, 0x15, 0x16, 0x17,
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
107 0x08, 0x09, 0x0A, 0x0B,
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
108 0x0C, 0x0D, 0x0E, 0x0F);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
109 } else {
6060
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
110 fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
111 0x04, 0x05, 0x06, 0x07,
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
112 0x18, 0x19, 0x1A, 0x1B,
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
113 0x1C, 0x1D, 0x1E, 0x1F);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
114 }
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
115
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
116 vsrcAuc = vec_ld(0, src);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
117
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
118 if (loadSecond)
6060
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
119 vsrcBuc = vec_ld(16, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
120 vsrcperm0 = vec_lvsl(0, src);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
121 vsrcperm1 = vec_lvsl(1, src);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
122
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
123 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
124 if (reallyBadAlign)
6060
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
125 vsrc1uc = vsrcBuc;
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
126 else
6060
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
127 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
128
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
129 vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
130 vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
131
6062
9d1590a4df90 Partially address issue299, no performance change apparently
lu_zero
parents: 6061
diff changeset
132 if (ABCD[3]) {
6063
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
133 if (!loadSecond) {// -> !reallyBadAlign
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
134 for (i = 0 ; i < h ; i++) {
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
135 vsrcCuc = vec_ld(stride + 0, src);
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
136 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
137 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
138
6063
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
139 CHROMA_MC8_ALTIVEC_CORE
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
140 }
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
141 } else {
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
142 vec_u8_t vsrcDuc;
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
143 for (i = 0 ; i < h ; i++) {
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
144 vsrcCuc = vec_ld(stride + 0, src);
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
145 vsrcDuc = vec_ld(stride + 16, src);
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
146 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
147 if (reallyBadAlign)
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
148 vsrc3uc = vsrcDuc;
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
149 else
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
150 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
151
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
152 CHROMA_MC8_ALTIVEC_CORE
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
153 }
6060
b1e3368c0d5e Reindent
lu_zero
parents: 6059
diff changeset
154 }
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
155 } else {
6064
62d040333d51 Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents: 6063
diff changeset
156 const vec_s16_t vE = vec_add(vB, vC);
6065
180976fd652e 10l do not load after the buffer...
lu_zero
parents: 6064
diff changeset
157 if (ABCD[2]) { // x == 0 B == 0
6067
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
158 if (!loadSecond) {// -> !reallyBadAlign
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
159 for (i = 0 ; i < h ; i++) {
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
160 vsrcCuc = vec_ld(stride + 0, src);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
161 vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
162 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
163
6067
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
164 vsrc0uc = vsrc1uc;
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
165 }
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
166 } else {
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
167 vec_u8_t vsrcDuc;
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
168 for (i = 0 ; i < h ; i++) {
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
169 vsrcCuc = vec_ld(stride + 0, src);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
170 vsrcDuc = vec_ld(stride + 15, src);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
171 vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
172 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
173
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
174 vsrc0uc = vsrc1uc;
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
175 }
6063
47ed1b9610b1 Reindent
lu_zero
parents: 6062
diff changeset
176 }
6065
180976fd652e 10l do not load after the buffer...
lu_zero
parents: 6064
diff changeset
177 } else { // y == 0 C == 0
6067
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
178 if (!loadSecond) {// -> !reallyBadAlign
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
179 for (i = 0 ; i < h ; i++) {
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
180 vsrcCuc = vec_ld(0, src);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
181 vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
182 vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
6065
180976fd652e 10l do not load after the buffer...
lu_zero
parents: 6064
diff changeset
183
6067
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
184 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
185 }
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
186 } else {
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
187 vec_u8_t vsrcDuc;
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
188 for (i = 0 ; i < h ; i++) {
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
189 vsrcCuc = vec_ld(0, src);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
190 vsrcDuc = vec_ld(15, src);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
191 vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
192 if (reallyBadAlign)
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
193 vsrc1uc = vsrcDuc;
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
194 else
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
195 vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
196
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
197 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
292269939c50 Reindent
lu_zero
parents: 6066
diff changeset
198 }
6064
62d040333d51 Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents: 6063
diff changeset
199 }
62d040333d51 Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents: 6063
diff changeset
200 }
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
201 }
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
202 POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
203 }
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
204
6059
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
205 #undef CHROMA_MC8_ALTIVEC_CORE
8c1a381bddb6 Factorize common code (almost cosmetic)
lu_zero
parents: 6058
diff changeset
206
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
207 /* this code assumes that stride % 16 == 0 */
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
208 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
209 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
210 register int i;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
211
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
212 LOAD_ZERO;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
213 const vec_u8_t permM2 = vec_lvsl(-2, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
214 const vec_u8_t permM1 = vec_lvsl(-1, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
215 const vec_u8_t permP0 = vec_lvsl(+0, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
216 const vec_u8_t permP1 = vec_lvsl(+1, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
217 const vec_u8_t permP2 = vec_lvsl(+2, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
218 const vec_u8_t permP3 = vec_lvsl(+3, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
219 const vec_s16_t v5ss = vec_splat_s16(5);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
220 const vec_u16_t v5us = vec_splat_u16(5);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
221 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
222 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
223
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
224 vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
225
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
226 register int align = ((((unsigned long)src) - 2) % 16);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
227
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
228 vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
229 srcP2A, srcP2B, srcP3A, srcP3B,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
230 srcM1A, srcM1B, srcM2A, srcM2B,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
231 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
232 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
233 psumA, psumB, sumA, sumB;
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
234
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
235 vec_u8_t sum, vdst, fsum;
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
236
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
237 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
238
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
239 for (i = 0 ; i < 16 ; i ++) {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
240 vec_u8_t srcR1 = vec_ld(-2, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
241 vec_u8_t srcR2 = vec_ld(14, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
242
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
243 switch (align) {
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
244 default: {
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
245 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
246 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
247 srcP0 = vec_perm(srcR1, srcR2, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
248 srcP1 = vec_perm(srcR1, srcR2, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
249 srcP2 = vec_perm(srcR1, srcR2, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
250 srcP3 = vec_perm(srcR1, srcR2, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
251 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
252 case 11: {
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
253 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
254 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
255 srcP0 = vec_perm(srcR1, srcR2, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
256 srcP1 = vec_perm(srcR1, srcR2, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
257 srcP2 = vec_perm(srcR1, srcR2, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
258 srcP3 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
259 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
260 case 12: {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
261 vec_u8_t srcR3 = vec_ld(30, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
262 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
263 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
264 srcP0 = vec_perm(srcR1, srcR2, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
265 srcP1 = vec_perm(srcR1, srcR2, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
266 srcP2 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
267 srcP3 = vec_perm(srcR2, srcR3, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
268 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
269 case 13: {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
270 vec_u8_t srcR3 = vec_ld(30, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
271 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
272 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
273 srcP0 = vec_perm(srcR1, srcR2, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
274 srcP1 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
275 srcP2 = vec_perm(srcR2, srcR3, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
276 srcP3 = vec_perm(srcR2, srcR3, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
277 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
278 case 14: {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
279 vec_u8_t srcR3 = vec_ld(30, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
280 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
281 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
282 srcP0 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
283 srcP1 = vec_perm(srcR2, srcR3, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
284 srcP2 = vec_perm(srcR2, srcR3, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
285 srcP3 = vec_perm(srcR2, srcR3, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
286 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
287 case 15: {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
288 vec_u8_t srcR3 = vec_ld(30, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
289 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
290 srcM1 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
291 srcP0 = vec_perm(srcR2, srcR3, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
292 srcP1 = vec_perm(srcR2, srcR3, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
293 srcP2 = vec_perm(srcR2, srcR3, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
294 srcP3 = vec_perm(srcR2, srcR3, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
295 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
296 }
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
297
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
298 srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
299 srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
300 srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
301 srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
302
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
303 srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
304 srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
305 srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
306 srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
307
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
308 srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
309 srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
310 srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
311 srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
312
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
313 sum1A = vec_adds(srcP0A, srcP1A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
314 sum1B = vec_adds(srcP0B, srcP1B);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
315 sum2A = vec_adds(srcM1A, srcP2A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
316 sum2B = vec_adds(srcM1B, srcP2B);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
317 sum3A = vec_adds(srcM2A, srcP3A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
318 sum3B = vec_adds(srcM2B, srcP3B);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
319
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
320 pp1A = vec_mladd(sum1A, v20ss, v16ss);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
321 pp1B = vec_mladd(sum1B, v20ss, v16ss);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
322
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
323 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
324 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
325
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
326 pp3A = vec_add(sum3A, pp1A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
327 pp3B = vec_add(sum3B, pp1B);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
328
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
329 psumA = vec_sub(pp3A, pp2A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
330 psumB = vec_sub(pp3B, pp2B);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
331
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
332 sumA = vec_sra(psumA, v5us);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
333 sumB = vec_sra(psumB, v5us);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
334
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
335 sum = vec_packsu(sumA, sumB);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
336
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
337 ASSERT_ALIGNED(dst);
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
338 vdst = vec_ld(0, dst);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
339
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
340 OP_U8_ALTIVEC(fsum, sum, vdst);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
341
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
342 vec_st(fsum, 0, dst);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
343
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
344 src += srcStride;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
345 dst += dstStride;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
346 }
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
347 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
348 }
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
349
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
350 /* this code assumes stride % 16 == 0 */
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
351 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
352 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
353
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
354 register int i;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
355
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
356 LOAD_ZERO;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
357 const vec_u8_t perm = vec_lvsl(0, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
358 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
359 const vec_u16_t v5us = vec_splat_u16(5);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
360 const vec_s16_t v5ss = vec_splat_s16(5);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
361 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
362
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
363 uint8_t *srcbis = src - (srcStride * 2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
364
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
365 const vec_u8_t srcM2a = vec_ld(0, srcbis);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
366 const vec_u8_t srcM2b = vec_ld(16, srcbis);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
367 const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
368 // srcbis += srcStride;
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
369 const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
370 const vec_u8_t srcM1b = vec_ld(16, srcbis);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
371 const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
372 // srcbis += srcStride;
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
373 const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
374 const vec_u8_t srcP0b = vec_ld(16, srcbis);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
375 const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
376 // srcbis += srcStride;
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
377 const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
378 const vec_u8_t srcP1b = vec_ld(16, srcbis);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
379 const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
380 // srcbis += srcStride;
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
381 const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
382 const vec_u8_t srcP2b = vec_ld(16, srcbis);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
383 const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
384 // srcbis += srcStride;
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
385
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
386 vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
387 vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
388 vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
389 vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
390 vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
391 vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
392 vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
393 vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
394 vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
395 vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
396
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
397 vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
398 psumA, psumB, sumA, sumB,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
399 srcP3ssA, srcP3ssB,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
400 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
401
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
402 vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3;
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
403
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
404 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
405
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
406 for (i = 0 ; i < 16 ; i++) {
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
407 srcP3a = vec_ld(0, srcbis += srcStride);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
408 srcP3b = vec_ld(16, srcbis);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
409 srcP3 = vec_perm(srcP3a, srcP3b, perm);
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
410 srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
411 srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
412 // srcbis += srcStride;
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
413
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
414 sum1A = vec_adds(srcP0ssA, srcP1ssA);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
415 sum1B = vec_adds(srcP0ssB, srcP1ssB);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
416 sum2A = vec_adds(srcM1ssA, srcP2ssA);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
417 sum2B = vec_adds(srcM1ssB, srcP2ssB);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
418 sum3A = vec_adds(srcM2ssA, srcP3ssA);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
419 sum3B = vec_adds(srcM2ssB, srcP3ssB);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
420
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
421 srcM2ssA = srcM1ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
422 srcM2ssB = srcM1ssB;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
423 srcM1ssA = srcP0ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
424 srcM1ssB = srcP0ssB;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
425 srcP0ssA = srcP1ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
426 srcP0ssB = srcP1ssB;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
427 srcP1ssA = srcP2ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
428 srcP1ssB = srcP2ssB;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
429 srcP2ssA = srcP3ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
430 srcP2ssB = srcP3ssB;
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
431
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
432 pp1A = vec_mladd(sum1A, v20ss, v16ss);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
433 pp1B = vec_mladd(sum1B, v20ss, v16ss);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
434
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
435 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
436 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
437
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
438 pp3A = vec_add(sum3A, pp1A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
439 pp3B = vec_add(sum3B, pp1B);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
440
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
441 psumA = vec_sub(pp3A, pp2A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
442 psumB = vec_sub(pp3B, pp2B);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
443
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
444 sumA = vec_sra(psumA, v5us);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
445 sumB = vec_sra(psumB, v5us);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
446
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
447 sum = vec_packsu(sumA, sumB);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
448
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
449 ASSERT_ALIGNED(dst);
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
450 vdst = vec_ld(0, dst);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
451
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
452 OP_U8_ALTIVEC(fsum, sum, vdst);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
453
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
454 vec_st(fsum, 0, dst);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
455
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
456 dst += dstStride;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
457 }
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
458 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
459 }
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
460
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
461 /* this code assumes stride % 16 == 0 *and* that tmp is properly aligned */
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
462 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
463 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
464 register int i;
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
465 LOAD_ZERO;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
466 const vec_u8_t permM2 = vec_lvsl(-2, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
467 const vec_u8_t permM1 = vec_lvsl(-1, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
468 const vec_u8_t permP0 = vec_lvsl(+0, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
469 const vec_u8_t permP1 = vec_lvsl(+1, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
470 const vec_u8_t permP2 = vec_lvsl(+2, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
471 const vec_u8_t permP3 = vec_lvsl(+3, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
472 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
473 const vec_u32_t v10ui = vec_splat_u32(10);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
474 const vec_s16_t v5ss = vec_splat_s16(5);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
475 const vec_s16_t v1ss = vec_splat_s16(1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
476 const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
477 const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
478
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
479 register int align = ((((unsigned long)src) - 2) % 16);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
480
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
481 vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
482 srcP2A, srcP2B, srcP3A, srcP3B,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
483 srcM1A, srcM1B, srcM2A, srcM2B,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
484 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
485 pp1A, pp1B, pp2A, pp2B, psumA, psumB;
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
486
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
487 const vec_u8_t mperm = (const vec_u8_t)
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
488 AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
489 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
490 int16_t *tmpbis = tmp;
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
491
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
492 vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
493 tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
494 tmpP2ssA, tmpP2ssB;
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
495
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
496 vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
497 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
498 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
499 ssumAe, ssumAo, ssumBe, ssumBo;
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
500 vec_u8_t fsum, sumv, sum, vdst;
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
501 vec_s16_t ssume, ssumo;
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
502
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
503 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
504 src -= (2 * srcStride);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
505 for (i = 0 ; i < 21 ; i ++) {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
506 vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
507 vec_u8_t srcR1 = vec_ld(-2, src);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
508 vec_u8_t srcR2 = vec_ld(14, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
509
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
510 switch (align) {
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
511 default: {
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
512 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
513 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
514 srcP0 = vec_perm(srcR1, srcR2, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
515 srcP1 = vec_perm(srcR1, srcR2, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
516 srcP2 = vec_perm(srcR1, srcR2, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
517 srcP3 = vec_perm(srcR1, srcR2, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
518 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
519 case 11: {
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
520 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
521 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
522 srcP0 = vec_perm(srcR1, srcR2, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
523 srcP1 = vec_perm(srcR1, srcR2, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
524 srcP2 = vec_perm(srcR1, srcR2, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
525 srcP3 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
526 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
527 case 12: {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
528 vec_u8_t srcR3 = vec_ld(30, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
529 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
530 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
531 srcP0 = vec_perm(srcR1, srcR2, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
532 srcP1 = vec_perm(srcR1, srcR2, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
533 srcP2 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
534 srcP3 = vec_perm(srcR2, srcR3, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
535 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
536 case 13: {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
537 vec_u8_t srcR3 = vec_ld(30, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
538 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
539 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
540 srcP0 = vec_perm(srcR1, srcR2, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
541 srcP1 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
542 srcP2 = vec_perm(srcR2, srcR3, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
543 srcP3 = vec_perm(srcR2, srcR3, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
544 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
545 case 14: {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
546 vec_u8_t srcR3 = vec_ld(30, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
547 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
548 srcM1 = vec_perm(srcR1, srcR2, permM1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
549 srcP0 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
550 srcP1 = vec_perm(srcR2, srcR3, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
551 srcP2 = vec_perm(srcR2, srcR3, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
552 srcP3 = vec_perm(srcR2, srcR3, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
553 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
554 case 15: {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
555 vec_u8_t srcR3 = vec_ld(30, src);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
556 srcM2 = vec_perm(srcR1, srcR2, permM2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
557 srcM1 = srcR2;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
558 srcP0 = vec_perm(srcR2, srcR3, permP0);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
559 srcP1 = vec_perm(srcR2, srcR3, permP1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
560 srcP2 = vec_perm(srcR2, srcR3, permP2);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
561 srcP3 = vec_perm(srcR2, srcR3, permP3);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
562 } break;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
563 }
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
564
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
565 srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
566 srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
567 srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
568 srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
569
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
570 srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
571 srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
572 srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
573 srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
574
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
575 srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
576 srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
577 srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
578 srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
579
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
580 sum1A = vec_adds(srcP0A, srcP1A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
581 sum1B = vec_adds(srcP0B, srcP1B);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
582 sum2A = vec_adds(srcM1A, srcP2A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
583 sum2B = vec_adds(srcM1B, srcP2B);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
584 sum3A = vec_adds(srcM2A, srcP3A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
585 sum3B = vec_adds(srcM2B, srcP3B);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
586
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
587 pp1A = vec_mladd(sum1A, v20ss, sum3A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
588 pp1B = vec_mladd(sum1B, v20ss, sum3B);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
589
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
590 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
591 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
592
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
593 psumA = vec_sub(pp1A, pp2A);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
594 psumB = vec_sub(pp1B, pp2B);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
595
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
596 vec_st(psumA, 0, tmp);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
597 vec_st(psumB, 16, tmp);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
598
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
599 src += srcStride;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
600 tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
601 }
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
602
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
603 tmpM2ssA = vec_ld(0, tmpbis);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
604 tmpM2ssB = vec_ld(16, tmpbis);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
605 tmpbis += tmpStride;
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
606 tmpM1ssA = vec_ld(0, tmpbis);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
607 tmpM1ssB = vec_ld(16, tmpbis);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
608 tmpbis += tmpStride;
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
609 tmpP0ssA = vec_ld(0, tmpbis);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
610 tmpP0ssB = vec_ld(16, tmpbis);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
611 tmpbis += tmpStride;
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
612 tmpP1ssA = vec_ld(0, tmpbis);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
613 tmpP1ssB = vec_ld(16, tmpbis);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
614 tmpbis += tmpStride;
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
615 tmpP2ssA = vec_ld(0, tmpbis);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
616 tmpP2ssB = vec_ld(16, tmpbis);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
617 tmpbis += tmpStride;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
618
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
619 for (i = 0 ; i < 16 ; i++) {
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
620 const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
621 const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
622
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
623 const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
624 const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
625 const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
626 const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
627 const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
628 const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
629
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
630 tmpbis += tmpStride;
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
631
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
632 tmpM2ssA = tmpM1ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
633 tmpM2ssB = tmpM1ssB;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
634 tmpM1ssA = tmpP0ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
635 tmpM1ssB = tmpP0ssB;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
636 tmpP0ssA = tmpP1ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
637 tmpP0ssB = tmpP1ssB;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
638 tmpP1ssA = tmpP2ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
639 tmpP1ssB = tmpP2ssB;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
640 tmpP2ssA = tmpP3ssA;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
641 tmpP2ssB = tmpP3ssB;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
642
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
643 pp1Ae = vec_mule(sum1A, v20ss);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
644 pp1Ao = vec_mulo(sum1A, v20ss);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
645 pp1Be = vec_mule(sum1B, v20ss);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
646 pp1Bo = vec_mulo(sum1B, v20ss);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
647
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
648 pp2Ae = vec_mule(sum2A, v5ss);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
649 pp2Ao = vec_mulo(sum2A, v5ss);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
650 pp2Be = vec_mule(sum2B, v5ss);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
651 pp2Bo = vec_mulo(sum2B, v5ss);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
652
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
653 pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
654 pp3Ao = vec_mulo(sum3A, v1ss);
5530
cd266411b11a use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents: 5019
diff changeset
655 pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
656 pp3Bo = vec_mulo(sum3B, v1ss);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
657
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
658 pp1cAe = vec_add(pp1Ae, v512si);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
659 pp1cAo = vec_add(pp1Ao, v512si);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
660 pp1cBe = vec_add(pp1Be, v512si);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
661 pp1cBo = vec_add(pp1Bo, v512si);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
662
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
663 pp32Ae = vec_sub(pp3Ae, pp2Ae);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
664 pp32Ao = vec_sub(pp3Ao, pp2Ao);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
665 pp32Be = vec_sub(pp3Be, pp2Be);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
666 pp32Bo = vec_sub(pp3Bo, pp2Bo);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
667
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
668 sumAe = vec_add(pp1cAe, pp32Ae);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
669 sumAo = vec_add(pp1cAo, pp32Ao);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
670 sumBe = vec_add(pp1cBe, pp32Be);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
671 sumBo = vec_add(pp1cBo, pp32Bo);
2967
ef2149182f1c COSMETICS: Remove all trailing whitespace.
diego
parents: 2236
diff changeset
672
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
673 ssumAe = vec_sra(sumAe, v10ui);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
674 ssumAo = vec_sra(sumAo, v10ui);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
675 ssumBe = vec_sra(sumBe, v10ui);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
676 ssumBo = vec_sra(sumBo, v10ui);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
677
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
678 ssume = vec_packs(ssumAe, ssumBe);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
679 ssumo = vec_packs(ssumAo, ssumBo);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
680
3346
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
681 sumv = vec_packsu(ssume, ssumo);
052765f11f1c Cosmetics: should not hurt performance, scream if are
lu_zero
parents: 3153
diff changeset
682 sum = vec_perm(sumv, sumv, mperm);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
683
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
684 ASSERT_ALIGNED(dst);
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
685 vdst = vec_ld(0, dst);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
686
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
687 OP_U8_ALTIVEC(fsum, sum, vdst);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
688
5603
861eb234e6ba remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents: 5530
diff changeset
689 vec_st(fsum, 0, dst);
2236
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
690
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
691 dst += dstStride;
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
692 }
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
693 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
b0102ea621dd h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff changeset
694 }