Mercurial > libavcodec.hg
annotate ppc/h264_template_altivec.c @ 6323:e6da66f378c7 libavcodec
mpegvideo.h has two function declarations with the 'inline' specifier
but no definition for those functions. The C standard requires a
definition to appear in the same translation unit for any function
declared with 'inline'. Most of the files including mpegvideo.h do not
define those functions. Fix this by removing the 'inline' specifiers
from the header.
patch by Uoti Urpala
author | diego |
---|---|
date | Sun, 03 Feb 2008 17:54:30 +0000 |
parents | 292269939c50 |
children | a8a79f5385f6 |
rev | line source |
---|---|
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
1 /* |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
2 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
3 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3577
diff
changeset
|
4 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3577
diff
changeset
|
5 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3577
diff
changeset
|
6 * FFmpeg is free software; you can redistribute it and/or |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
7 * modify it under the terms of the GNU Lesser General Public |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
8 * License as published by the Free Software Foundation; either |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3577
diff
changeset
|
9 * version 2.1 of the License, or (at your option) any later version. |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
10 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3577
diff
changeset
|
11 * FFmpeg is distributed in the hope that it will be useful, |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
14 * Lesser General Public License for more details. |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
15 * |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
16 * You should have received a copy of the GNU Lesser General Public |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3577
diff
changeset
|
17 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2967
diff
changeset
|
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
19 */ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
20 |
5603
861eb234e6ba
remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents:
5530
diff
changeset
|
21 //#define DEBUG_ALIGNMENT |
861eb234e6ba
remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents:
5530
diff
changeset
|
22 #ifdef DEBUG_ALIGNMENT |
861eb234e6ba
remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents:
5530
diff
changeset
|
23 #define ASSERT_ALIGNED(ptr) assert(!((unsigned long)(ptr)&0x0000000F)); |
861eb234e6ba
remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents:
5530
diff
changeset
|
24 #else |
861eb234e6ba
remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents:
5530
diff
changeset
|
25 #define ASSERT_ALIGNED(ptr) ; |
861eb234e6ba
remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents:
5530
diff
changeset
|
26 #endif |
861eb234e6ba
remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents:
5530
diff
changeset
|
27 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
28 /* this code assumes that stride % 16 == 0 */ |
6059 | 29 |
30 #define CHROMA_MC8_ALTIVEC_CORE \ | |
31 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);\ | |
32 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);\ | |
33 \ | |
6061 | 34 psum = vec_mladd(vA, vsrc0ssH, v32ss);\ |
6059 | 35 psum = vec_mladd(vB, vsrc1ssH, psum);\ |
36 psum = vec_mladd(vC, vsrc2ssH, psum);\ | |
37 psum = vec_mladd(vD, vsrc3ssH, psum);\ | |
38 psum = vec_sr(psum, v6us);\ | |
39 \ | |
40 vdst = vec_ld(0, dst);\ | |
41 ppsum = (vec_u8_t)vec_pack(psum, psum);\ | |
42 vfdst = vec_perm(vdst, ppsum, fperm);\ | |
43 \ | |
44 OP_U8_ALTIVEC(fsum, vfdst, vdst);\ | |
45 \ | |
46 vec_st(fsum, 0, dst);\ | |
47 \ | |
48 vsrc0ssH = vsrc2ssH;\ | |
49 vsrc1ssH = vsrc3ssH;\ | |
50 \ | |
51 dst += stride;\ | |
52 src += stride; | |
53 | |
6062
9d1590a4df90
Partially address issue299, no performance change apparently
lu_zero
parents:
6061
diff
changeset
|
54 #define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \ |
6064
62d040333d51
Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents:
6063
diff
changeset
|
55 \ |
62d040333d51
Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents:
6063
diff
changeset
|
56 vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);\ |
62d040333d51
Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents:
6063
diff
changeset
|
57 vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);\ |
6062
9d1590a4df90
Partially address issue299, no performance change apparently
lu_zero
parents:
6061
diff
changeset
|
58 \ |
9d1590a4df90
Partially address issue299, no performance change apparently
lu_zero
parents:
6061
diff
changeset
|
59 psum = vec_mladd(vA, vsrc0ssH, v32ss);\ |
6064
62d040333d51
Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents:
6063
diff
changeset
|
60 psum = vec_mladd(vE, vsrc1ssH, psum);\ |
6062
9d1590a4df90
Partially address issue299, no performance change apparently
lu_zero
parents:
6061
diff
changeset
|
61 psum = vec_sr(psum, v6us);\ |
9d1590a4df90
Partially address issue299, no performance change apparently
lu_zero
parents:
6061
diff
changeset
|
62 \ |
9d1590a4df90
Partially address issue299, no performance change apparently
lu_zero
parents:
6061
diff
changeset
|
63 vdst = vec_ld(0, dst);\ |
9d1590a4df90
Partially address issue299, no performance change apparently
lu_zero
parents:
6061
diff
changeset
|
64 ppsum = (vec_u8_t)vec_pack(psum, psum);\ |
9d1590a4df90
Partially address issue299, no performance change apparently
lu_zero
parents:
6061
diff
changeset
|
65 vfdst = vec_perm(vdst, ppsum, fperm);\ |
9d1590a4df90
Partially address issue299, no performance change apparently
lu_zero
parents:
6061
diff
changeset
|
66 \ |
9d1590a4df90
Partially address issue299, no performance change apparently
lu_zero
parents:
6061
diff
changeset
|
67 OP_U8_ALTIVEC(fsum, vfdst, vdst);\ |
9d1590a4df90
Partially address issue299, no performance change apparently
lu_zero
parents:
6061
diff
changeset
|
68 \ |
9d1590a4df90
Partially address issue299, no performance change apparently
lu_zero
parents:
6061
diff
changeset
|
69 vec_st(fsum, 0, dst);\ |
9d1590a4df90
Partially address issue299, no performance change apparently
lu_zero
parents:
6061
diff
changeset
|
70 \ |
9d1590a4df90
Partially address issue299, no performance change apparently
lu_zero
parents:
6061
diff
changeset
|
71 dst += stride;\ |
9d1590a4df90
Partially address issue299, no performance change apparently
lu_zero
parents:
6061
diff
changeset
|
72 src += stride; |
6059 | 73 |
6063 | 74 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, |
75 int stride, int h, int x, int y) { | |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
76 POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); |
5019
41cabe79ba25
use macro Use DECLARE_ALIGNED_16 to align stack-allocated variables
gpoirier
parents:
3947
diff
changeset
|
77 DECLARE_ALIGNED_16(signed int, ABCD[4]) = |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
78 {((8 - x) * (8 - y)), |
6058 | 79 (( x) * (8 - y)), |
80 ((8 - x) * ( y)), | |
81 (( x) * ( y))}; | |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
82 register int i; |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
83 vec_u8_t fperm; |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
84 const vec_s32_t vABCD = vec_ld(0, ABCD); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
85 const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
86 const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
87 const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
88 const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
89 LOAD_ZERO; |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
90 const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
91 const vec_u16_t v6us = vec_splat_u16(6); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
92 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
93 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; |
2967 | 94 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
95 vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
96 vec_u8_t vsrc0uc, vsrc1uc; |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
97 vec_s16_t vsrc0ssH, vsrc1ssH; |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
98 vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc; |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
99 vec_s16_t vsrc2ssH, vsrc3ssH, psum; |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
100 vec_u8_t vdst, ppsum, vfdst, fsum; |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
101 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
102 POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
103 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
104 if (((unsigned long)dst) % 16 == 0) { |
6060 | 105 fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13, |
106 0x14, 0x15, 0x16, 0x17, | |
107 0x08, 0x09, 0x0A, 0x0B, | |
108 0x0C, 0x0D, 0x0E, 0x0F); | |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
109 } else { |
6060 | 110 fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03, |
111 0x04, 0x05, 0x06, 0x07, | |
112 0x18, 0x19, 0x1A, 0x1B, | |
113 0x1C, 0x1D, 0x1E, 0x1F); | |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
114 } |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
115 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
116 vsrcAuc = vec_ld(0, src); |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
117 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
118 if (loadSecond) |
6060 | 119 vsrcBuc = vec_ld(16, src); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
120 vsrcperm0 = vec_lvsl(0, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
121 vsrcperm1 = vec_lvsl(1, src); |
2967 | 122 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
123 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
124 if (reallyBadAlign) |
6060 | 125 vsrc1uc = vsrcBuc; |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
126 else |
6060 | 127 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); |
2967 | 128 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
129 vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
130 vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
131 |
6062
9d1590a4df90
Partially address issue299, no performance change apparently
lu_zero
parents:
6061
diff
changeset
|
132 if (ABCD[3]) { |
6063 | 133 if (!loadSecond) {// -> !reallyBadAlign |
134 for (i = 0 ; i < h ; i++) { | |
135 vsrcCuc = vec_ld(stride + 0, src); | |
136 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); | |
137 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); | |
2967 | 138 |
6063 | 139 CHROMA_MC8_ALTIVEC_CORE |
140 } | |
141 } else { | |
142 vec_u8_t vsrcDuc; | |
143 for (i = 0 ; i < h ; i++) { | |
144 vsrcCuc = vec_ld(stride + 0, src); | |
145 vsrcDuc = vec_ld(stride + 16, src); | |
146 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); | |
147 if (reallyBadAlign) | |
148 vsrc3uc = vsrcDuc; | |
149 else | |
150 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); | |
151 | |
152 CHROMA_MC8_ALTIVEC_CORE | |
153 } | |
6060 | 154 } |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
155 } else { |
6064
62d040333d51
Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents:
6063
diff
changeset
|
156 const vec_s16_t vE = vec_add(vB, vC); |
6065 | 157 if (ABCD[2]) { // x == 0 B == 0 |
6067 | 158 if (!loadSecond) {// -> !reallyBadAlign |
159 for (i = 0 ; i < h ; i++) { | |
160 vsrcCuc = vec_ld(stride + 0, src); | |
161 vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); | |
162 CHROMA_MC8_ALTIVEC_CORE_SIMPLE | |
2967 | 163 |
6067 | 164 vsrc0uc = vsrc1uc; |
165 } | |
166 } else { | |
167 vec_u8_t vsrcDuc; | |
168 for (i = 0 ; i < h ; i++) { | |
169 vsrcCuc = vec_ld(stride + 0, src); | |
170 vsrcDuc = vec_ld(stride + 15, src); | |
171 vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); | |
172 CHROMA_MC8_ALTIVEC_CORE_SIMPLE | |
173 | |
174 vsrc0uc = vsrc1uc; | |
175 } | |
6063 | 176 } |
6065 | 177 } else { // y == 0 C == 0 |
6067 | 178 if (!loadSecond) {// -> !reallyBadAlign |
179 for (i = 0 ; i < h ; i++) { | |
180 vsrcCuc = vec_ld(0, src); | |
181 vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); | |
182 vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); | |
6065 | 183 |
6067 | 184 CHROMA_MC8_ALTIVEC_CORE_SIMPLE |
185 } | |
186 } else { | |
187 vec_u8_t vsrcDuc; | |
188 for (i = 0 ; i < h ; i++) { | |
189 vsrcCuc = vec_ld(0, src); | |
190 vsrcDuc = vec_ld(15, src); | |
191 vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); | |
192 if (reallyBadAlign) | |
193 vsrc1uc = vsrcDuc; | |
194 else | |
195 vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); | |
196 | |
197 CHROMA_MC8_ALTIVEC_CORE_SIMPLE | |
198 } | |
6064
62d040333d51
Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents:
6063
diff
changeset
|
199 } |
62d040333d51
Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually
lu_zero
parents:
6063
diff
changeset
|
200 } |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
201 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
202 POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
203 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
204 |
6059 | 205 #undef CHROMA_MC8_ALTIVEC_CORE |
206 | |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
207 /* this code assumes stride % 16 == 0 */ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
208 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
209 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
210 register int i; |
2967 | 211 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
212 LOAD_ZERO; |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
213 const vec_u8_t permM2 = vec_lvsl(-2, src); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
214 const vec_u8_t permM1 = vec_lvsl(-1, src); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
215 const vec_u8_t permP0 = vec_lvsl(+0, src); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
216 const vec_u8_t permP1 = vec_lvsl(+1, src); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
217 const vec_u8_t permP2 = vec_lvsl(+2, src); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
218 const vec_u8_t permP3 = vec_lvsl(+3, src); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
219 const vec_s16_t v5ss = vec_splat_s16(5); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
220 const vec_u16_t v5us = vec_splat_u16(5); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
221 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
222 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
223 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
224 vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
225 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
226 register int align = ((((unsigned long)src) - 2) % 16); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
227 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
228 vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
229 srcP2A, srcP2B, srcP3A, srcP3B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
230 srcM1A, srcM1B, srcM2A, srcM2B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
231 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
232 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
233 psumA, psumB, sumA, sumB; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
234 |
5603
861eb234e6ba
remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents:
5530
diff
changeset
|
235 vec_u8_t sum, vdst, fsum; |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
236 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
237 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
238 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
239 for (i = 0 ; i < 16 ; i ++) { |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
240 vec_u8_t srcR1 = vec_ld(-2, src); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
241 vec_u8_t srcR2 = vec_ld(14, src); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
242 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
243 switch (align) { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
244 default: { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
245 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
246 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
247 srcP0 = vec_perm(srcR1, srcR2, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
248 srcP1 = vec_perm(srcR1, srcR2, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
249 srcP2 = vec_perm(srcR1, srcR2, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
250 srcP3 = vec_perm(srcR1, srcR2, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
251 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
252 case 11: { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
253 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
254 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
255 srcP0 = vec_perm(srcR1, srcR2, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
256 srcP1 = vec_perm(srcR1, srcR2, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
257 srcP2 = vec_perm(srcR1, srcR2, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
258 srcP3 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
259 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
260 case 12: { |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
261 vec_u8_t srcR3 = vec_ld(30, src); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
262 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
263 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
264 srcP0 = vec_perm(srcR1, srcR2, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
265 srcP1 = vec_perm(srcR1, srcR2, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
266 srcP2 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
267 srcP3 = vec_perm(srcR2, srcR3, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
268 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
269 case 13: { |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
270 vec_u8_t srcR3 = vec_ld(30, src); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
271 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
272 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
273 srcP0 = vec_perm(srcR1, srcR2, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
274 srcP1 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
275 srcP2 = vec_perm(srcR2, srcR3, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
276 srcP3 = vec_perm(srcR2, srcR3, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
277 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
278 case 14: { |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
279 vec_u8_t srcR3 = vec_ld(30, src); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
280 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
281 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
282 srcP0 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
283 srcP1 = vec_perm(srcR2, srcR3, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
284 srcP2 = vec_perm(srcR2, srcR3, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
285 srcP3 = vec_perm(srcR2, srcR3, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
286 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
287 case 15: { |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
288 vec_u8_t srcR3 = vec_ld(30, src); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
289 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
290 srcM1 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
291 srcP0 = vec_perm(srcR2, srcR3, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
292 srcP1 = vec_perm(srcR2, srcR3, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
293 srcP2 = vec_perm(srcR2, srcR3, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
294 srcP3 = vec_perm(srcR2, srcR3, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
295 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
296 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
297 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
298 srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
299 srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
300 srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
301 srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
302 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
303 srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
304 srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
305 srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
306 srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
307 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
308 srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
309 srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
310 srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
311 srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
312 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
313 sum1A = vec_adds(srcP0A, srcP1A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
314 sum1B = vec_adds(srcP0B, srcP1B); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
315 sum2A = vec_adds(srcM1A, srcP2A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
316 sum2B = vec_adds(srcM1B, srcP2B); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
317 sum3A = vec_adds(srcM2A, srcP3A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
318 sum3B = vec_adds(srcM2B, srcP3B); |
2967 | 319 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
320 pp1A = vec_mladd(sum1A, v20ss, v16ss); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
321 pp1B = vec_mladd(sum1B, v20ss, v16ss); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
322 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
323 pp2A = vec_mladd(sum2A, v5ss, zero_s16v); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
324 pp2B = vec_mladd(sum2B, v5ss, zero_s16v); |
2967 | 325 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
326 pp3A = vec_add(sum3A, pp1A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
327 pp3B = vec_add(sum3B, pp1B); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
328 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
329 psumA = vec_sub(pp3A, pp2A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
330 psumB = vec_sub(pp3B, pp2B); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
331 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
332 sumA = vec_sra(psumA, v5us); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
333 sumB = vec_sra(psumB, v5us); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
334 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
335 sum = vec_packsu(sumA, sumB); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
336 |
5603
861eb234e6ba
remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents:
5530
diff
changeset
|
337 ASSERT_ALIGNED(dst); |
861eb234e6ba
remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents:
5530
diff
changeset
|
338 vdst = vec_ld(0, dst); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
339 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
340 OP_U8_ALTIVEC(fsum, sum, vdst); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
341 |
5603
861eb234e6ba
remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents:
5530
diff
changeset
|
342 vec_st(fsum, 0, dst); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
343 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
344 src += srcStride; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
345 dst += dstStride; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
346 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
347 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
348 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
349 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
350 /* this code assume stride % 16 == 0 */ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
351 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
352 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); |
2967 | 353 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
354 register int i; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
355 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
356 LOAD_ZERO; |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
357 const vec_u8_t perm = vec_lvsl(0, src); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
358 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
359 const vec_u16_t v5us = vec_splat_u16(5); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
360 const vec_s16_t v5ss = vec_splat_s16(5); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
361 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); |
2967 | 362 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
363 uint8_t *srcbis = src - (srcStride * 2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
364 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
365 const vec_u8_t srcM2a = vec_ld(0, srcbis); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
366 const vec_u8_t srcM2b = vec_ld(16, srcbis); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
367 const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm); |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
368 // srcbis += srcStride; |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
369 const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
370 const vec_u8_t srcM1b = vec_ld(16, srcbis); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
371 const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm); |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
372 // srcbis += srcStride; |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
373 const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
374 const vec_u8_t srcP0b = vec_ld(16, srcbis); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
375 const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm); |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
376 // srcbis += srcStride; |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
377 const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
378 const vec_u8_t srcP1b = vec_ld(16, srcbis); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
379 const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm); |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
380 // srcbis += srcStride; |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
381 const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
382 const vec_u8_t srcP2b = vec_ld(16, srcbis); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
383 const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm); |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
384 // srcbis += srcStride; |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
385 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
386 vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
387 vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
388 vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
389 vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
390 vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
391 vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
392 vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
393 vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
394 vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
395 vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2); |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
396 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
397 vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
398 psumA, psumB, sumA, sumB, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
399 srcP3ssA, srcP3ssB, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
400 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
401 |
5603
861eb234e6ba
remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents:
5530
diff
changeset
|
402 vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3; |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
403 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
404 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
405 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
406 for (i = 0 ; i < 16 ; i++) { |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
407 srcP3a = vec_ld(0, srcbis += srcStride); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
408 srcP3b = vec_ld(16, srcbis); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
409 srcP3 = vec_perm(srcP3a, srcP3b, perm); |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
410 srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
411 srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3); |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
412 // srcbis += srcStride; |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
413 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
414 sum1A = vec_adds(srcP0ssA, srcP1ssA); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
415 sum1B = vec_adds(srcP0ssB, srcP1ssB); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
416 sum2A = vec_adds(srcM1ssA, srcP2ssA); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
417 sum2B = vec_adds(srcM1ssB, srcP2ssB); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
418 sum3A = vec_adds(srcM2ssA, srcP3ssA); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
419 sum3B = vec_adds(srcM2ssB, srcP3ssB); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
420 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
421 srcM2ssA = srcM1ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
422 srcM2ssB = srcM1ssB; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
423 srcM1ssA = srcP0ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
424 srcM1ssB = srcP0ssB; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
425 srcP0ssA = srcP1ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
426 srcP0ssB = srcP1ssB; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
427 srcP1ssA = srcP2ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
428 srcP1ssB = srcP2ssB; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
429 srcP2ssA = srcP3ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
430 srcP2ssB = srcP3ssB; |
2967 | 431 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
432 pp1A = vec_mladd(sum1A, v20ss, v16ss); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
433 pp1B = vec_mladd(sum1B, v20ss, v16ss); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
434 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
435 pp2A = vec_mladd(sum2A, v5ss, zero_s16v); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
436 pp2B = vec_mladd(sum2B, v5ss, zero_s16v); |
2967 | 437 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
438 pp3A = vec_add(sum3A, pp1A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
439 pp3B = vec_add(sum3B, pp1B); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
440 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
441 psumA = vec_sub(pp3A, pp2A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
442 psumB = vec_sub(pp3B, pp2B); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
443 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
444 sumA = vec_sra(psumA, v5us); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
445 sumB = vec_sra(psumB, v5us); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
446 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
447 sum = vec_packsu(sumA, sumB); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
448 |
5603
861eb234e6ba
remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents:
5530
diff
changeset
|
449 ASSERT_ALIGNED(dst); |
861eb234e6ba
remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents:
5530
diff
changeset
|
450 vdst = vec_ld(0, dst); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
451 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
452 OP_U8_ALTIVEC(fsum, sum, vdst); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
453 |
5603
861eb234e6ba
remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents:
5530
diff
changeset
|
454 vec_st(fsum, 0, dst); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
455 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
456 dst += dstStride; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
457 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
458 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
459 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
460 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
461 /* this code assumes stride % 16 == 0 *and* tmp is properly aligned */ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
462 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
463 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
464 register int i; |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
465 LOAD_ZERO; |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
466 const vec_u8_t permM2 = vec_lvsl(-2, src); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
467 const vec_u8_t permM1 = vec_lvsl(-1, src); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
468 const vec_u8_t permP0 = vec_lvsl(+0, src); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
469 const vec_u8_t permP1 = vec_lvsl(+1, src); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
470 const vec_u8_t permP2 = vec_lvsl(+2, src); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
471 const vec_u8_t permP3 = vec_lvsl(+3, src); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
472 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
473 const vec_u32_t v10ui = vec_splat_u32(10); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
474 const vec_s16_t v5ss = vec_splat_s16(5); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
475 const vec_s16_t v1ss = vec_splat_s16(1); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
476 const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
477 const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
478 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
479 register int align = ((((unsigned long)src) - 2) % 16); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
480 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
481 vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
482 srcP2A, srcP2B, srcP3A, srcP3B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
483 srcM1A, srcM1B, srcM2A, srcM2B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
484 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
485 pp1A, pp1B, pp2A, pp2B, psumA, psumB; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
486 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
487 const vec_u8_t mperm = (const vec_u8_t) |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
488 AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
489 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
490 int16_t *tmpbis = tmp; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
491 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
492 vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
493 tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
494 tmpP2ssA, tmpP2ssB; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
495 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
496 vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
497 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
498 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
499 ssumAe, ssumAo, ssumBe, ssumBo; |
5603
861eb234e6ba
remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents:
5530
diff
changeset
|
500 vec_u8_t fsum, sumv, sum, vdst; |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
501 vec_s16_t ssume, ssumo; |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
502 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
503 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
504 src -= (2 * srcStride); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
505 for (i = 0 ; i < 21 ; i ++) { |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
506 vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
507 vec_u8_t srcR1 = vec_ld(-2, src); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
508 vec_u8_t srcR2 = vec_ld(14, src); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
509 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
510 switch (align) { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
511 default: { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
512 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
513 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
514 srcP0 = vec_perm(srcR1, srcR2, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
515 srcP1 = vec_perm(srcR1, srcR2, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
516 srcP2 = vec_perm(srcR1, srcR2, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
517 srcP3 = vec_perm(srcR1, srcR2, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
518 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
519 case 11: { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
520 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
521 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
522 srcP0 = vec_perm(srcR1, srcR2, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
523 srcP1 = vec_perm(srcR1, srcR2, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
524 srcP2 = vec_perm(srcR1, srcR2, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
525 srcP3 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
526 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
527 case 12: { |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
528 vec_u8_t srcR3 = vec_ld(30, src); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
529 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
530 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
531 srcP0 = vec_perm(srcR1, srcR2, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
532 srcP1 = vec_perm(srcR1, srcR2, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
533 srcP2 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
534 srcP3 = vec_perm(srcR2, srcR3, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
535 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
536 case 13: { |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
537 vec_u8_t srcR3 = vec_ld(30, src); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
538 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
539 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
540 srcP0 = vec_perm(srcR1, srcR2, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
541 srcP1 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
542 srcP2 = vec_perm(srcR2, srcR3, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
543 srcP3 = vec_perm(srcR2, srcR3, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
544 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
545 case 14: { |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
546 vec_u8_t srcR3 = vec_ld(30, src); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
547 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
548 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
549 srcP0 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
550 srcP1 = vec_perm(srcR2, srcR3, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
551 srcP2 = vec_perm(srcR2, srcR3, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
552 srcP3 = vec_perm(srcR2, srcR3, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
553 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
554 case 15: { |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
555 vec_u8_t srcR3 = vec_ld(30, src); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
556 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
557 srcM1 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
558 srcP0 = vec_perm(srcR2, srcR3, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
559 srcP1 = vec_perm(srcR2, srcR3, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
560 srcP2 = vec_perm(srcR2, srcR3, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
561 srcP3 = vec_perm(srcR2, srcR3, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
562 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
563 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
564 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
565 srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
566 srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
567 srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
568 srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
569 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
570 srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
571 srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
572 srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
573 srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
574 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
575 srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
576 srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
577 srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
578 srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); |
2967 | 579 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
580 sum1A = vec_adds(srcP0A, srcP1A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
581 sum1B = vec_adds(srcP0B, srcP1B); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
582 sum2A = vec_adds(srcM1A, srcP2A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
583 sum2B = vec_adds(srcM1B, srcP2B); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
584 sum3A = vec_adds(srcM2A, srcP3A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
585 sum3B = vec_adds(srcM2B, srcP3B); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
586 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
587 pp1A = vec_mladd(sum1A, v20ss, sum3A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
588 pp1B = vec_mladd(sum1B, v20ss, sum3B); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
589 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
590 pp2A = vec_mladd(sum2A, v5ss, zero_s16v); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
591 pp2B = vec_mladd(sum2B, v5ss, zero_s16v); |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
592 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
593 psumA = vec_sub(pp1A, pp2A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
594 psumB = vec_sub(pp1B, pp2B); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
595 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
596 vec_st(psumA, 0, tmp); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
597 vec_st(psumB, 16, tmp); |
2967 | 598 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
599 src += srcStride; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
600 tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
601 } |
2967 | 602 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
603 tmpM2ssA = vec_ld(0, tmpbis); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
604 tmpM2ssB = vec_ld(16, tmpbis); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
605 tmpbis += tmpStride; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
606 tmpM1ssA = vec_ld(0, tmpbis); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
607 tmpM1ssB = vec_ld(16, tmpbis); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
608 tmpbis += tmpStride; |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
609 tmpP0ssA = vec_ld(0, tmpbis); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
610 tmpP0ssB = vec_ld(16, tmpbis); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
611 tmpbis += tmpStride; |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
612 tmpP1ssA = vec_ld(0, tmpbis); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
613 tmpP1ssB = vec_ld(16, tmpbis); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
614 tmpbis += tmpStride; |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
615 tmpP2ssA = vec_ld(0, tmpbis); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
616 tmpP2ssB = vec_ld(16, tmpbis); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
617 tmpbis += tmpStride; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
618 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
619 for (i = 0 ; i < 16 ; i++) { |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
620 const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
621 const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
622 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
623 const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
624 const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
625 const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
626 const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
627 const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA); |
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
628 const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
629 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
630 tmpbis += tmpStride; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
631 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
632 tmpM2ssA = tmpM1ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
633 tmpM2ssB = tmpM1ssB; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
634 tmpM1ssA = tmpP0ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
635 tmpM1ssB = tmpP0ssB; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
636 tmpP0ssA = tmpP1ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
637 tmpP0ssB = tmpP1ssB; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
638 tmpP1ssA = tmpP2ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
639 tmpP1ssB = tmpP2ssB; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
640 tmpP2ssA = tmpP3ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
641 tmpP2ssB = tmpP3ssB; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
642 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
643 pp1Ae = vec_mule(sum1A, v20ss); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
644 pp1Ao = vec_mulo(sum1A, v20ss); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
645 pp1Be = vec_mule(sum1B, v20ss); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
646 pp1Bo = vec_mulo(sum1B, v20ss); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
647 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
648 pp2Ae = vec_mule(sum2A, v5ss); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
649 pp2Ao = vec_mulo(sum2A, v5ss); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
650 pp2Be = vec_mule(sum2B, v5ss); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
651 pp2Bo = vec_mulo(sum2B, v5ss); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
652 |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
653 pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui); |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
654 pp3Ao = vec_mulo(sum3A, v1ss); |
5530
cd266411b11a
use shorter types vec_"type" instead of the too long vector "type"
gpoirier
parents:
5019
diff
changeset
|
655 pp3Be = vec_sra((vec_s32_t)sum3B, v16ui); |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
656 pp3Bo = vec_mulo(sum3B, v1ss); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
657 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
658 pp1cAe = vec_add(pp1Ae, v512si); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
659 pp1cAo = vec_add(pp1Ao, v512si); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
660 pp1cBe = vec_add(pp1Be, v512si); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
661 pp1cBo = vec_add(pp1Bo, v512si); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
662 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
663 pp32Ae = vec_sub(pp3Ae, pp2Ae); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
664 pp32Ao = vec_sub(pp3Ao, pp2Ao); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
665 pp32Be = vec_sub(pp3Be, pp2Be); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
666 pp32Bo = vec_sub(pp3Bo, pp2Bo); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
667 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
668 sumAe = vec_add(pp1cAe, pp32Ae); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
669 sumAo = vec_add(pp1cAo, pp32Ao); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
670 sumBe = vec_add(pp1cBe, pp32Be); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
671 sumBo = vec_add(pp1cBo, pp32Bo); |
2967 | 672 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
673 ssumAe = vec_sra(sumAe, v10ui); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
674 ssumAo = vec_sra(sumAo, v10ui); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
675 ssumBe = vec_sra(sumBe, v10ui); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
676 ssumBo = vec_sra(sumBo, v10ui); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
677 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
678 ssume = vec_packs(ssumAe, ssumBe); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
679 ssumo = vec_packs(ssumAo, ssumBo); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
680 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
681 sumv = vec_packsu(ssume, ssumo); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
682 sum = vec_perm(sumv, sumv, mperm); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
683 |
5603
861eb234e6ba
remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents:
5530
diff
changeset
|
684 ASSERT_ALIGNED(dst); |
861eb234e6ba
remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents:
5530
diff
changeset
|
685 vdst = vec_ld(0, dst); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
686 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
687 OP_U8_ALTIVEC(fsum, sum, vdst); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
688 |
5603
861eb234e6ba
remove alignment correction of the destination pointers in luma_16x6
gpoirier
parents:
5530
diff
changeset
|
689 vec_st(fsum, 0, dst); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
690 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
691 dst += dstStride; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
692 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
693 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
694 } |