Mercurial > libavcodec.hg
annotate ppc/h264_template_altivec.c @ 5310:9aa9197034d7 libavcodec
AC-3 decoder, soc revision 40, Aug 9 00:10:14 2006 UTC by cloud9
More code cleanup.
Window is now runtime generated.
Fixed the bugs in rematrixing routine and
in Decoding AC3 Bitstreams when coupling is in use.
Still struggling to find out what affects the quality of
the produced sound. Can anybody have a look at the
imdct routines do_imdct_256 and do_imdct_512 and tell me
whether it is the correctly implemented as described in
standard.
author | jbr |
---|---|
date | Sat, 14 Jul 2007 15:57:51 +0000 |
parents | 41cabe79ba25 |
children | cd266411b11a |
rev | line source |
---|---|
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
1 /* |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
2 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
3 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3577
diff
changeset
|
4 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3577
diff
changeset
|
5 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3577
diff
changeset
|
6 * FFmpeg is free software; you can redistribute it and/or |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
7 * modify it under the terms of the GNU Lesser General Public |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
8 * License as published by the Free Software Foundation; either |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3577
diff
changeset
|
9 * version 2.1 of the License, or (at your option) any later version. |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
10 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3577
diff
changeset
|
11 * FFmpeg is distributed in the hope that it will be useful, |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
14 * Lesser General Public License for more details. |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
15 * |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
16 * You should have received a copy of the GNU Lesser General Public |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3577
diff
changeset
|
17 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2967
diff
changeset
|
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
19 */ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
20 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
21 /* this code assumes that stride % 16 == 0 */ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
22 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
23 POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); |
5019
41cabe79ba25
use macro Use DECLARE_ALIGNED_16 to align stack-allocated variables
gpoirier
parents:
3947
diff
changeset
|
24 DECLARE_ALIGNED_16(signed int, ABCD[4]) = |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
25 {((8 - x) * (8 - y)), |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
26 ((x) * (8 - y)), |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
27 ((8 - x) * (y)), |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
28 ((x) * (y))}; |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
29 register int i; |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
30 vector unsigned char fperm; |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
31 const vector signed int vABCD = vec_ld(0, ABCD); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
32 const vector signed short vA = vec_splat((vector signed short)vABCD, 1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
33 const vector signed short vB = vec_splat((vector signed short)vABCD, 3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
34 const vector signed short vC = vec_splat((vector signed short)vABCD, 5); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
35 const vector signed short vD = vec_splat((vector signed short)vABCD, 7); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
36 const vector signed int vzero = vec_splat_s32(0); |
3137 | 37 const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
38 const vector unsigned short v6us = vec_splat_u16(6); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
39 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
40 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; |
2967 | 41 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
42 vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
43 vector unsigned char vsrc0uc, vsrc1uc; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
44 vector signed short vsrc0ssH, vsrc1ssH; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
45 vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
46 vector signed short vsrc2ssH, vsrc3ssH, psum; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
47 vector unsigned char vdst, ppsum, vfdst, fsum; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
48 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
49 POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
50 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
51 if (((unsigned long)dst) % 16 == 0) { |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
52 fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
53 0x14, 0x15, 0x16, 0x17, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
54 0x08, 0x09, 0x0A, 0x0B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
55 0x0C, 0x0D, 0x0E, 0x0F); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
56 } else { |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
57 fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
58 0x04, 0x05, 0x06, 0x07, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
59 0x18, 0x19, 0x1A, 0x1B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
60 0x1C, 0x1D, 0x1E, 0x1F); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
61 } |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
62 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
63 vsrcAuc = vec_ld(0, src); |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
64 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
65 if (loadSecond) |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
66 vsrcBuc = vec_ld(16, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
67 vsrcperm0 = vec_lvsl(0, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
68 vsrcperm1 = vec_lvsl(1, src); |
2967 | 69 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
70 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
71 if (reallyBadAlign) |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
72 vsrc1uc = vsrcBuc; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
73 else |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
74 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); |
2967 | 75 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
76 vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
77 (vector unsigned char)vsrc0uc); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
78 vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
79 (vector unsigned char)vsrc1uc); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
80 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
81 if (!loadSecond) {// -> !reallyBadAlign |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
82 for (i = 0 ; i < h ; i++) { |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
83 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
84 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
85 vsrcCuc = vec_ld(stride + 0, src); |
2967 | 86 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
87 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
88 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); |
2967 | 89 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
90 vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
91 (vector unsigned char)vsrc2uc); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
92 vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
93 (vector unsigned char)vsrc3uc); |
2967 | 94 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
95 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
96 psum = vec_mladd(vB, vsrc1ssH, psum); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
97 psum = vec_mladd(vC, vsrc2ssH, psum); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
98 psum = vec_mladd(vD, vsrc3ssH, psum); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
99 psum = vec_add(v32ss, psum); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
100 psum = vec_sra(psum, v6us); |
2967 | 101 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
102 vdst = vec_ld(0, dst); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
103 ppsum = (vector unsigned char)vec_packsu(psum, psum); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
104 vfdst = vec_perm(vdst, ppsum, fperm); |
2967 | 105 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
106 OP_U8_ALTIVEC(fsum, vfdst, vdst); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
107 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
108 vec_st(fsum, 0, dst); |
2967 | 109 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
110 vsrc0ssH = vsrc2ssH; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
111 vsrc1ssH = vsrc3ssH; |
2967 | 112 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
113 dst += stride; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
114 src += stride; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
115 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
116 } else { |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
117 vector unsigned char vsrcDuc; |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
118 for (i = 0 ; i < h ; i++) { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
119 vsrcCuc = vec_ld(stride + 0, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
120 vsrcDuc = vec_ld(stride + 16, src); |
2967 | 121 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
122 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
123 if (reallyBadAlign) |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
124 vsrc3uc = vsrcDuc; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
125 else |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
126 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); |
2967 | 127 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
128 vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
129 (vector unsigned char)vsrc2uc); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
130 vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
131 (vector unsigned char)vsrc3uc); |
2967 | 132 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
133 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
134 psum = vec_mladd(vB, vsrc1ssH, psum); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
135 psum = vec_mladd(vC, vsrc2ssH, psum); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
136 psum = vec_mladd(vD, vsrc3ssH, psum); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
137 psum = vec_add(v32ss, psum); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
138 psum = vec_sr(psum, v6us); |
2967 | 139 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
140 vdst = vec_ld(0, dst); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
141 ppsum = (vector unsigned char)vec_pack(psum, psum); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
142 vfdst = vec_perm(vdst, ppsum, fperm); |
2967 | 143 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
144 OP_U8_ALTIVEC(fsum, vfdst, vdst); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
145 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
146 vec_st(fsum, 0, dst); |
2967 | 147 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
148 vsrc0ssH = vsrc2ssH; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
149 vsrc1ssH = vsrc3ssH; |
2967 | 150 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
151 dst += stride; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
152 src += stride; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
153 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
154 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
155 POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
156 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
157 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
158 /* this code assumes stride % 16 == 0 */ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
159 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
160 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
161 register int i; |
2967 | 162 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
163 const vector signed int vzero = vec_splat_s32(0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
164 const vector unsigned char permM2 = vec_lvsl(-2, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
165 const vector unsigned char permM1 = vec_lvsl(-1, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
166 const vector unsigned char permP0 = vec_lvsl(+0, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
167 const vector unsigned char permP1 = vec_lvsl(+1, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
168 const vector unsigned char permP2 = vec_lvsl(+2, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
169 const vector unsigned char permP3 = vec_lvsl(+3, src); |
3137 | 170 const vector signed short v5ss = vec_splat_s16(5); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
171 const vector unsigned short v5us = vec_splat_u16(5); |
3137 | 172 const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); |
173 const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); | |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
174 const vector unsigned char dstperm = vec_lvsr(0, dst); |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
175 const vector unsigned char neg1 = |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
176 (const vector unsigned char) vec_splat_s8(-1); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
177 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
178 const vector unsigned char dstmask = |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
179 vec_perm((const vector unsigned char)vzero, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
180 neg1, dstperm); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
181 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
182 vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
183 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
184 register int align = ((((unsigned long)src) - 2) % 16); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
185 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
186 vector signed short srcP0A, srcP0B, srcP1A, srcP1B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
187 srcP2A, srcP2B, srcP3A, srcP3B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
188 srcM1A, srcM1B, srcM2A, srcM2B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
189 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
190 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
191 psumA, psumB, sumA, sumB; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
192 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
193 vector unsigned char sum, dst1, dst2, vdst, fsum, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
194 rsum, fdst1, fdst2; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
195 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
196 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
197 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
198 for (i = 0 ; i < 16 ; i ++) { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
199 vector unsigned char srcR1 = vec_ld(-2, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
200 vector unsigned char srcR2 = vec_ld(14, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
201 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
202 switch (align) { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
203 default: { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
204 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
205 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
206 srcP0 = vec_perm(srcR1, srcR2, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
207 srcP1 = vec_perm(srcR1, srcR2, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
208 srcP2 = vec_perm(srcR1, srcR2, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
209 srcP3 = vec_perm(srcR1, srcR2, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
210 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
211 case 11: { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
212 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
213 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
214 srcP0 = vec_perm(srcR1, srcR2, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
215 srcP1 = vec_perm(srcR1, srcR2, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
216 srcP2 = vec_perm(srcR1, srcR2, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
217 srcP3 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
218 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
219 case 12: { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
220 vector unsigned char srcR3 = vec_ld(30, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
221 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
222 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
223 srcP0 = vec_perm(srcR1, srcR2, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
224 srcP1 = vec_perm(srcR1, srcR2, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
225 srcP2 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
226 srcP3 = vec_perm(srcR2, srcR3, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
227 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
228 case 13: { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
229 vector unsigned char srcR3 = vec_ld(30, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
230 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
231 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
232 srcP0 = vec_perm(srcR1, srcR2, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
233 srcP1 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
234 srcP2 = vec_perm(srcR2, srcR3, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
235 srcP3 = vec_perm(srcR2, srcR3, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
236 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
237 case 14: { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
238 vector unsigned char srcR3 = vec_ld(30, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
239 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
240 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
241 srcP0 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
242 srcP1 = vec_perm(srcR2, srcR3, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
243 srcP2 = vec_perm(srcR2, srcR3, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
244 srcP3 = vec_perm(srcR2, srcR3, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
245 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
246 case 15: { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
247 vector unsigned char srcR3 = vec_ld(30, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
248 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
249 srcM1 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
250 srcP0 = vec_perm(srcR2, srcR3, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
251 srcP1 = vec_perm(srcR2, srcR3, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
252 srcP2 = vec_perm(srcR2, srcR3, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
253 srcP3 = vec_perm(srcR2, srcR3, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
254 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
255 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
256 |
3350
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
257 srcP0A = (vector signed short) |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
258 vec_mergeh((vector unsigned char)vzero, srcP0); |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
259 srcP0B = (vector signed short) |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
260 vec_mergel((vector unsigned char)vzero, srcP0); |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
261 srcP1A = (vector signed short) |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
262 vec_mergeh((vector unsigned char)vzero, srcP1); |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
263 srcP1B = (vector signed short) |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
264 vec_mergel((vector unsigned char)vzero, srcP1); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
265 |
3350
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
266 srcP2A = (vector signed short) |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
267 vec_mergeh((vector unsigned char)vzero, srcP2); |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
268 srcP2B = (vector signed short) |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
269 vec_mergel((vector unsigned char)vzero, srcP2); |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
270 srcP3A = (vector signed short) |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
271 vec_mergeh((vector unsigned char)vzero, srcP3); |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
272 srcP3B = (vector signed short) |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
273 vec_mergel((vector unsigned char)vzero, srcP3); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
274 |
3350
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
275 srcM1A = (vector signed short) |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
276 vec_mergeh((vector unsigned char)vzero, srcM1); |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
277 srcM1B = (vector signed short) |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
278 vec_mergel((vector unsigned char)vzero, srcM1); |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
279 srcM2A = (vector signed short) |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
280 vec_mergeh((vector unsigned char)vzero, srcM2); |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
281 srcM2B = (vector signed short) |
eff63ac2b545
Make gcc-3 happy again, thanks to Olivier Castan <castan.o@free.fr> for pointing the issue
lu_zero
parents:
3346
diff
changeset
|
282 vec_mergel((vector unsigned char)vzero, srcM2); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
283 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
284 sum1A = vec_adds(srcP0A, srcP1A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
285 sum1B = vec_adds(srcP0B, srcP1B); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
286 sum2A = vec_adds(srcM1A, srcP2A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
287 sum2B = vec_adds(srcM1B, srcP2B); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
288 sum3A = vec_adds(srcM2A, srcP3A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
289 sum3B = vec_adds(srcM2B, srcP3B); |
2967 | 290 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
291 pp1A = vec_mladd(sum1A, v20ss, v16ss); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
292 pp1B = vec_mladd(sum1B, v20ss, v16ss); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
293 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
294 pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
295 pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); |
2967 | 296 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
297 pp3A = vec_add(sum3A, pp1A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
298 pp3B = vec_add(sum3B, pp1B); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
299 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
300 psumA = vec_sub(pp3A, pp2A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
301 psumB = vec_sub(pp3B, pp2B); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
302 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
303 sumA = vec_sra(psumA, v5us); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
304 sumB = vec_sra(psumB, v5us); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
305 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
306 sum = vec_packsu(sumA, sumB); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
307 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
308 dst1 = vec_ld(0, dst); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
309 dst2 = vec_ld(16, dst); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
310 vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
311 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
312 OP_U8_ALTIVEC(fsum, sum, vdst); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
313 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
314 rsum = vec_perm(fsum, fsum, dstperm); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
315 fdst1 = vec_sel(dst1, rsum, dstmask); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
316 fdst2 = vec_sel(rsum, dst2, dstmask); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
317 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
318 vec_st(fdst1, 0, dst); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
319 vec_st(fdst2, 16, dst); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
320 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
321 src += srcStride; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
322 dst += dstStride; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
323 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
324 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
325 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
326 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
327 /* this code assumes stride % 16 == 0 */ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
328 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
329 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); |
2967 | 330 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
331 register int i; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
332 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
333 const vector signed int vzero = vec_splat_s32(0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
334 const vector unsigned char perm = vec_lvsl(0, src); |
3137 | 335 const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
336 const vector unsigned short v5us = vec_splat_u16(5); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
337 const vector signed short v5ss = vec_splat_s16(5); |
3137 | 338 const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
339 const vector unsigned char dstperm = vec_lvsr(0, dst); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
340 const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
341 const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); |
2967 | 342 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
343 uint8_t *srcbis = src - (srcStride * 2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
344 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
345 const vector unsigned char srcM2a = vec_ld(0, srcbis); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
346 const vector unsigned char srcM2b = vec_ld(16, srcbis); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
347 const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm); |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
348 // srcbis += srcStride; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
349 const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
350 const vector unsigned char srcM1b = vec_ld(16, srcbis); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
351 const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm); |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
352 // srcbis += srcStride; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
353 const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
354 const vector unsigned char srcP0b = vec_ld(16, srcbis); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
355 const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm); |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
356 // srcbis += srcStride; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
357 const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
358 const vector unsigned char srcP1b = vec_ld(16, srcbis); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
359 const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm); |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
360 // srcbis += srcStride; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
361 const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
362 const vector unsigned char srcP2b = vec_ld(16, srcbis); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
363 const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm); |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
364 // srcbis += srcStride; |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
365 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
366 vector signed short srcM2ssA = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
367 vec_mergeh((vector unsigned char)vzero, srcM2); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
368 vector signed short srcM2ssB = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
369 vec_mergel((vector unsigned char)vzero, srcM2); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
370 vector signed short srcM1ssA = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
371 vec_mergeh((vector unsigned char)vzero, srcM1); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
372 vector signed short srcM1ssB = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
373 vec_mergel((vector unsigned char)vzero, srcM1); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
374 vector signed short srcP0ssA = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
375 vec_mergeh((vector unsigned char)vzero, srcP0); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
376 vector signed short srcP0ssB = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
377 vec_mergel((vector unsigned char)vzero, srcP0); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
378 vector signed short srcP1ssA = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
379 vec_mergeh((vector unsigned char)vzero, srcP1); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
380 vector signed short srcP1ssB = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
381 vec_mergel((vector unsigned char)vzero, srcP1); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
382 vector signed short srcP2ssA = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
383 vec_mergeh((vector unsigned char)vzero, srcP2); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
384 vector signed short srcP2ssB = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
385 vec_mergel((vector unsigned char)vzero, srcP2); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
386 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
387 vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
388 psumA, psumB, sumA, sumB, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
389 srcP3ssA, srcP3ssB, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
390 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
391 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
392 vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
393 srcP3a, srcP3b, srcP3; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
394 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
395 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
396 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
397 for (i = 0 ; i < 16 ; i++) { |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
398 srcP3a = vec_ld(0, srcbis += srcStride); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
399 srcP3b = vec_ld(16, srcbis); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
400 srcP3 = vec_perm(srcP3a, srcP3b, perm); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
401 srcP3ssA = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
402 vec_mergeh((vector unsigned char)vzero, srcP3); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
403 srcP3ssB = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
404 vec_mergel((vector unsigned char)vzero, srcP3); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
405 // srcbis += srcStride; |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
406 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
407 sum1A = vec_adds(srcP0ssA, srcP1ssA); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
408 sum1B = vec_adds(srcP0ssB, srcP1ssB); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
409 sum2A = vec_adds(srcM1ssA, srcP2ssA); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
410 sum2B = vec_adds(srcM1ssB, srcP2ssB); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
411 sum3A = vec_adds(srcM2ssA, srcP3ssA); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
412 sum3B = vec_adds(srcM2ssB, srcP3ssB); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
413 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
414 srcM2ssA = srcM1ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
415 srcM2ssB = srcM1ssB; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
416 srcM1ssA = srcP0ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
417 srcM1ssB = srcP0ssB; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
418 srcP0ssA = srcP1ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
419 srcP0ssB = srcP1ssB; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
420 srcP1ssA = srcP2ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
421 srcP1ssB = srcP2ssB; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
422 srcP2ssA = srcP3ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
423 srcP2ssB = srcP3ssB; |
2967 | 424 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
425 pp1A = vec_mladd(sum1A, v20ss, v16ss); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
426 pp1B = vec_mladd(sum1B, v20ss, v16ss); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
427 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
428 pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
429 pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); |
2967 | 430 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
431 pp3A = vec_add(sum3A, pp1A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
432 pp3B = vec_add(sum3B, pp1B); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
433 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
434 psumA = vec_sub(pp3A, pp2A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
435 psumB = vec_sub(pp3B, pp2B); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
436 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
437 sumA = vec_sra(psumA, v5us); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
438 sumB = vec_sra(psumB, v5us); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
439 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
440 sum = vec_packsu(sumA, sumB); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
441 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
442 dst1 = vec_ld(0, dst); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
443 dst2 = vec_ld(16, dst); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
444 vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
445 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
446 OP_U8_ALTIVEC(fsum, sum, vdst); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
447 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
448 rsum = vec_perm(fsum, fsum, dstperm); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
449 fdst1 = vec_sel(dst1, rsum, dstmask); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
450 fdst2 = vec_sel(rsum, dst2, dstmask); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
451 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
452 vec_st(fdst1, 0, dst); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
453 vec_st(fdst2, 16, dst); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
454 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
455 dst += dstStride; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
456 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
457 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
458 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
459 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
460 /* this code assumes stride % 16 == 0 *and* tmp is properly aligned */ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
461 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
462 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
463 register int i; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
464 const vector signed int vzero = vec_splat_s32(0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
465 const vector unsigned char permM2 = vec_lvsl(-2, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
466 const vector unsigned char permM1 = vec_lvsl(-1, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
467 const vector unsigned char permP0 = vec_lvsl(+0, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
468 const vector unsigned char permP1 = vec_lvsl(+1, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
469 const vector unsigned char permP2 = vec_lvsl(+2, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
470 const vector unsigned char permP3 = vec_lvsl(+3, src); |
3137 | 471 const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
472 const vector unsigned int v10ui = vec_splat_u32(10); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
473 const vector signed short v5ss = vec_splat_s16(5); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
474 const vector signed short v1ss = vec_splat_s16(1); |
3137 | 475 const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); |
3153
7aa01243b4d3
use vec_splat_u32 to initialise a v16ui (patch by Likai Liu < liulk _at_ cs.bu.edu >)
aurel
parents:
3137
diff
changeset
|
476 const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
477 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
478 register int align = ((((unsigned long)src) - 2) % 16); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
479 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
480 const vector unsigned char neg1 = (const vector unsigned char) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
481 vec_splat_s8(-1); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
482 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
483 vector signed short srcP0A, srcP0B, srcP1A, srcP1B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
484 srcP2A, srcP2B, srcP3A, srcP3B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
485 srcM1A, srcM1B, srcM2A, srcM2B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
486 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
487 pp1A, pp1B, pp2A, pp2B, psumA, psumB; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
488 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
489 const vector unsigned char dstperm = vec_lvsr(0, dst); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
490 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
491 const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
492 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
493 const vector unsigned char mperm = (const vector unsigned char) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
494 AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
495 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
496 int16_t *tmpbis = tmp; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
497 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
498 vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
499 tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
500 tmpP2ssA, tmpP2ssB; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
501 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
502 vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
503 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
504 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
505 ssumAe, ssumAo, ssumBe, ssumBo; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
506 vector unsigned char fsum, sumv, sum, dst1, dst2, vdst, |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
507 rsum, fdst1, fdst2; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
508 vector signed short ssume, ssumo; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
509 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
510 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
511 src -= (2 * srcStride); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
512 for (i = 0 ; i < 21 ; i ++) { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
513 vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
514 vector unsigned char srcR1 = vec_ld(-2, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
515 vector unsigned char srcR2 = vec_ld(14, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
516 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
517 switch (align) { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
518 default: { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
519 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
520 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
521 srcP0 = vec_perm(srcR1, srcR2, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
522 srcP1 = vec_perm(srcR1, srcR2, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
523 srcP2 = vec_perm(srcR1, srcR2, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
524 srcP3 = vec_perm(srcR1, srcR2, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
525 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
526 case 11: { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
527 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
528 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
529 srcP0 = vec_perm(srcR1, srcR2, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
530 srcP1 = vec_perm(srcR1, srcR2, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
531 srcP2 = vec_perm(srcR1, srcR2, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
532 srcP3 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
533 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
534 case 12: { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
535 vector unsigned char srcR3 = vec_ld(30, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
536 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
537 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
538 srcP0 = vec_perm(srcR1, srcR2, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
539 srcP1 = vec_perm(srcR1, srcR2, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
540 srcP2 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
541 srcP3 = vec_perm(srcR2, srcR3, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
542 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
543 case 13: { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
544 vector unsigned char srcR3 = vec_ld(30, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
545 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
546 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
547 srcP0 = vec_perm(srcR1, srcR2, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
548 srcP1 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
549 srcP2 = vec_perm(srcR2, srcR3, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
550 srcP3 = vec_perm(srcR2, srcR3, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
551 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
552 case 14: { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
553 vector unsigned char srcR3 = vec_ld(30, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
554 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
555 srcM1 = vec_perm(srcR1, srcR2, permM1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
556 srcP0 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
557 srcP1 = vec_perm(srcR2, srcR3, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
558 srcP2 = vec_perm(srcR2, srcR3, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
559 srcP3 = vec_perm(srcR2, srcR3, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
560 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
561 case 15: { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
562 vector unsigned char srcR3 = vec_ld(30, src); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
563 srcM2 = vec_perm(srcR1, srcR2, permM2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
564 srcM1 = srcR2; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
565 srcP0 = vec_perm(srcR2, srcR3, permP0); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
566 srcP1 = vec_perm(srcR2, srcR3, permP1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
567 srcP2 = vec_perm(srcR2, srcR3, permP2); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
568 srcP3 = vec_perm(srcR2, srcR3, permP3); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
569 } break; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
570 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
571 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
572 srcP0A = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
573 vec_mergeh((vector unsigned char)vzero, srcP0); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
574 srcP0B = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
575 vec_mergel((vector unsigned char)vzero, srcP0); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
576 srcP1A = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
577 vec_mergeh((vector unsigned char)vzero, srcP1); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
578 srcP1B = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
579 vec_mergel((vector unsigned char)vzero, srcP1); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
580 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
581 srcP2A = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
582 vec_mergeh((vector unsigned char)vzero, srcP2); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
583 srcP2B = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
584 vec_mergel((vector unsigned char)vzero, srcP2); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
585 srcP3A = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
586 vec_mergeh((vector unsigned char)vzero, srcP3); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
587 srcP3B = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
588 vec_mergel((vector unsigned char)vzero, srcP3); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
589 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
590 srcM1A = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
591 vec_mergeh((vector unsigned char)vzero, srcM1); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
592 srcM1B = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
593 vec_mergel((vector unsigned char)vzero, srcM1); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
594 srcM2A = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
595 vec_mergeh((vector unsigned char)vzero, srcM2); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
596 srcM2B = (vector signed short) |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
597 vec_mergel((vector unsigned char)vzero, srcM2); |
2967 | 598 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
599 sum1A = vec_adds(srcP0A, srcP1A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
600 sum1B = vec_adds(srcP0B, srcP1B); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
601 sum2A = vec_adds(srcM1A, srcP2A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
602 sum2B = vec_adds(srcM1B, srcP2B); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
603 sum3A = vec_adds(srcM2A, srcP3A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
604 sum3B = vec_adds(srcM2B, srcP3B); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
605 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
606 pp1A = vec_mladd(sum1A, v20ss, sum3A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
607 pp1B = vec_mladd(sum1B, v20ss, sum3B); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
608 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
609 pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
610 pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
611 |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
612 psumA = vec_sub(pp1A, pp2A); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
613 psumB = vec_sub(pp1B, pp2B); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
614 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
615 vec_st(psumA, 0, tmp); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
616 vec_st(psumB, 16, tmp); |
2967 | 617 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
618 src += srcStride; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
619 tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
620 } |
2967 | 621 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
622 tmpM2ssA = vec_ld(0, tmpbis); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
623 tmpM2ssB = vec_ld(16, tmpbis); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
624 tmpbis += tmpStride; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
625 tmpM1ssA = vec_ld(0, tmpbis); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
626 tmpM1ssB = vec_ld(16, tmpbis); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
627 tmpbis += tmpStride; |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
628 tmpP0ssA = vec_ld(0, tmpbis); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
629 tmpP0ssB = vec_ld(16, tmpbis); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
630 tmpbis += tmpStride; |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
631 tmpP1ssA = vec_ld(0, tmpbis); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
632 tmpP1ssB = vec_ld(16, tmpbis); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
633 tmpbis += tmpStride; |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
634 tmpP2ssA = vec_ld(0, tmpbis); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
635 tmpP2ssB = vec_ld(16, tmpbis); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
636 tmpbis += tmpStride; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
637 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
638 for (i = 0 ; i < 16 ; i++) { |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
639 const vector signed short tmpP3ssA = vec_ld(0, tmpbis); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
640 const vector signed short tmpP3ssB = vec_ld(16, tmpbis); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
641 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
642 const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
643 const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
644 const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
645 const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
646 const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
647 const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
648 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
649 tmpbis += tmpStride; |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
650 |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
651 tmpM2ssA = tmpM1ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
652 tmpM2ssB = tmpM1ssB; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
653 tmpM1ssA = tmpP0ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
654 tmpM1ssB = tmpP0ssB; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
655 tmpP0ssA = tmpP1ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
656 tmpP0ssB = tmpP1ssB; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
657 tmpP1ssA = tmpP2ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
658 tmpP1ssB = tmpP2ssB; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
659 tmpP2ssA = tmpP3ssA; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
660 tmpP2ssB = tmpP3ssB; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
661 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
662 pp1Ae = vec_mule(sum1A, v20ss); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
663 pp1Ao = vec_mulo(sum1A, v20ss); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
664 pp1Be = vec_mule(sum1B, v20ss); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
665 pp1Bo = vec_mulo(sum1B, v20ss); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
666 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
667 pp2Ae = vec_mule(sum2A, v5ss); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
668 pp2Ao = vec_mulo(sum2A, v5ss); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
669 pp2Be = vec_mule(sum2B, v5ss); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
670 pp2Bo = vec_mulo(sum2B, v5ss); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
671 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
672 pp3Ae = vec_sra((vector signed int)sum3A, v16ui); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
673 pp3Ao = vec_mulo(sum3A, v1ss); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
674 pp3Be = vec_sra((vector signed int)sum3B, v16ui); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
675 pp3Bo = vec_mulo(sum3B, v1ss); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
676 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
677 pp1cAe = vec_add(pp1Ae, v512si); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
678 pp1cAo = vec_add(pp1Ao, v512si); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
679 pp1cBe = vec_add(pp1Be, v512si); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
680 pp1cBo = vec_add(pp1Bo, v512si); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
681 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
682 pp32Ae = vec_sub(pp3Ae, pp2Ae); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
683 pp32Ao = vec_sub(pp3Ao, pp2Ao); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
684 pp32Be = vec_sub(pp3Be, pp2Be); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
685 pp32Bo = vec_sub(pp3Bo, pp2Bo); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
686 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
687 sumAe = vec_add(pp1cAe, pp32Ae); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
688 sumAo = vec_add(pp1cAo, pp32Ao); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
689 sumBe = vec_add(pp1cBe, pp32Be); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
690 sumBo = vec_add(pp1cBo, pp32Bo); |
2967 | 691 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
692 ssumAe = vec_sra(sumAe, v10ui); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
693 ssumAo = vec_sra(sumAo, v10ui); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
694 ssumBe = vec_sra(sumBe, v10ui); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
695 ssumBo = vec_sra(sumBo, v10ui); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
696 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
697 ssume = vec_packs(ssumAe, ssumBe); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
698 ssumo = vec_packs(ssumAo, ssumBo); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
699 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
700 sumv = vec_packsu(ssume, ssumo); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
701 sum = vec_perm(sumv, sumv, mperm); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
702 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
703 dst1 = vec_ld(0, dst); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
704 dst2 = vec_ld(16, dst); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
705 vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
706 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
707 OP_U8_ALTIVEC(fsum, sum, vdst); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
708 |
3346
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
709 rsum = vec_perm(fsum, fsum, dstperm); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
710 fdst1 = vec_sel(dst1, rsum, dstmask); |
052765f11f1c
Cosmetics: should not hurt performance, scream if are
lu_zero
parents:
3153
diff
changeset
|
711 fdst2 = vec_sel(rsum, dst2, dstmask); |
2236
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
712 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
713 vec_st(fdst1, 0, dst); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
714 vec_st(fdst2, 16, dst); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
715 |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
716 dst += dstStride; |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
717 } |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
718 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); |
b0102ea621dd
h264 qpel mc, size 16 patch by (Romain Dolbeau <dolbeau at caps-entreprise dot com>)
michael
parents:
diff
changeset
|
719 } |