annotate ppc/int_altivec.c @ 10082:9f4b529bd5c0 libavcodec

PPC: remove unnecessary alignment on local variables Storing a single element from a vector where all elements have the same value does not require an aligned destination. Which element is stored depends on the alignment of the destination address, but since they all have the same value, the result is the same regardless of the alignment.
author mru
date Mon, 24 Aug 2009 21:42:22 +0000
parents 7cee7292d5cc
children 5da7180afadf
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4838
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
1 /*
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
2 * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
3 *
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
4 * This file is part of FFmpeg.
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
5 *
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
6 * FFmpeg is free software; you can redistribute it and/or
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
7 * modify it under the terms of the GNU Lesser General Public
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
8 * License as published by the Free Software Foundation; either
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
9 * version 2.1 of the License, or (at your option) any later version.
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
10 *
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
11 * FFmpeg is distributed in the hope that it will be useful,
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
14 * Lesser General Public License for more details.
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
15 *
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
16 * You should have received a copy of the GNU Lesser General Public
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
17 * License along with FFmpeg; if not, write to the Free Software
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
19 */
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
20
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
21 /**
8718
e9d9d946f213 Use full internal pathname in doxygen @file directives.
diego
parents: 8494
diff changeset
22 ** @file libavcodec/ppc/int_altivec.c
4838
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
23 ** integer misc ops.
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
24 **/
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
25
9364
7cee7292d5cc Remove unnecessary gcc_fixes.h #include.
diego
parents: 8718
diff changeset
26 #include "config.h"
7cee7292d5cc Remove unnecessary gcc_fixes.h #include.
diego
parents: 8718
diff changeset
27 #if HAVE_ALTIVEC_H
7cee7292d5cc Remove unnecessary gcc_fixes.h #include.
diego
parents: 8718
diff changeset
28 #include <altivec.h>
7cee7292d5cc Remove unnecessary gcc_fixes.h #include.
diego
parents: 8718
diff changeset
29 #endif
4838
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
30
9364
7cee7292d5cc Remove unnecessary gcc_fixes.h #include.
diego
parents: 8718
diff changeset
31 #include "libavcodec/dsputil.h"
4838
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
32
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
33 #include "dsputil_altivec.h"
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
34
7204
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
35 #include "types_altivec.h"
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
36
5255
669a97223dc7 make arguments to ssd_int8_vs_int16() const
mru
parents: 5010
diff changeset
37 static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
669a97223dc7 make arguments to ssd_int8_vs_int16() const
mru
parents: 5010
diff changeset
38 int size) {
4838
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
39 int i, size16;
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
40 vector signed char vpix1;
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
41 vector signed short vpix2, vdiff, vpix1l,vpix1h;
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
42 union { vector signed int vscore;
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
43 int32_t score[4];
7333
a8a79f5385f6 cosmetics: Reformat PPC code in libavcodec according to style guidelines.
diego
parents: 7204
diff changeset
44 } u;
4838
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
45 u.vscore = vec_splat_s32(0);
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
46 //
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
47 //XXX lazy way, fix it later
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
48
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
49 #define vec_unaligned_load(b) \
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
50 vec_perm(vec_ld(0,b),vec_ld(15,b),vec_lvsl(0, b));
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
51
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
52 size16 = size >> 4;
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
53 while(size16) {
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
54 // score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
55 //load pix1 and the first batch of pix2
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
56
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
57 vpix1 = vec_unaligned_load(pix1);
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
58 vpix2 = vec_unaligned_load(pix2);
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
59 pix2 += 8;
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
60 //unpack
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
61 vpix1h = vec_unpackh(vpix1);
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
62 vdiff = vec_sub(vpix1h, vpix2);
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
63 vpix1l = vec_unpackl(vpix1);
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
64 // load another batch from pix2
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
65 vpix2 = vec_unaligned_load(pix2);
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
66 u.vscore = vec_msum(vdiff, vdiff, u.vscore);
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
67 vdiff = vec_sub(vpix1l, vpix2);
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
68 u.vscore = vec_msum(vdiff, vdiff, u.vscore);
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
69 pix1 += 16;
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
70 pix2 += 8;
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
71 size16--;
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
72 }
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
73 u.vscore = vec_sums(u.vscore, vec_splat_s32(0));
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
74
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
75 size %= 16;
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
76 for (i = 0; i < size; i++) {
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
77 u.score[3] += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
78 }
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
79 return u.score[3];
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
80 }
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
81
7204
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
82 static void add_int16_altivec(int16_t * v1, int16_t * v2, int order)
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
83 {
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
84 int i;
8494
1615d6b75ada Cleanup _t types in libavcodec/ppc
lu_zero
parents: 7333
diff changeset
85 register vec_s16 vec, *pv;
7204
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
86
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
87 for(i = 0; i < order; i += 8){
8494
1615d6b75ada Cleanup _t types in libavcodec/ppc
lu_zero
parents: 7333
diff changeset
88 pv = (vec_s16*)v2;
7204
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
89 vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
90 vec_st(vec_add(vec_ld(0, v1), vec), 0, v1);
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
91 v1 += 8;
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
92 v2 += 8;
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
93 }
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
94 }
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
95
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
96 static void sub_int16_altivec(int16_t * v1, int16_t * v2, int order)
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
97 {
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
98 int i;
8494
1615d6b75ada Cleanup _t types in libavcodec/ppc
lu_zero
parents: 7333
diff changeset
99 register vec_s16 vec, *pv;
7204
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
100
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
101 for(i = 0; i < order; i += 8){
8494
1615d6b75ada Cleanup _t types in libavcodec/ppc
lu_zero
parents: 7333
diff changeset
102 pv = (vec_s16*)v2;
7204
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
103 vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
104 vec_st(vec_sub(vec_ld(0, v1), vec), 0, v1);
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
105 v1 += 8;
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
106 v2 += 8;
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
107 }
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
108 }
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
109
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
110 static int32_t scalarproduct_int16_altivec(int16_t * v1, int16_t * v2, int order, const int shift)
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
111 {
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
112 int i;
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
113 LOAD_ZERO;
8494
1615d6b75ada Cleanup _t types in libavcodec/ppc
lu_zero
parents: 7333
diff changeset
114 register vec_s16 vec1, *pv;
1615d6b75ada Cleanup _t types in libavcodec/ppc
lu_zero
parents: 7333
diff changeset
115 register vec_s32 res = vec_splat_s32(0), t;
1615d6b75ada Cleanup _t types in libavcodec/ppc
lu_zero
parents: 7333
diff changeset
116 register vec_u32 shifts;
10082
9f4b529bd5c0 PPC: remove unnecessary alignment on local variables
mru
parents: 9364
diff changeset
117 int32_t ires;
7204
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
118
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
119 shifts = zero_u32v;
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
120 if(shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1)));
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
121 if(shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08));
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
122 if(shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04));
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
123 if(shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02));
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
124 if(shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01));
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
125
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
126 for(i = 0; i < order; i += 8){
8494
1615d6b75ada Cleanup _t types in libavcodec/ppc
lu_zero
parents: 7333
diff changeset
127 pv = (vec_s16*)v1;
7204
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
128 vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
129 t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
130 t = vec_sr(t, shifts);
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
131 res = vec_sums(t, res);
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
132 v1 += 8;
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
133 v2 += 8;
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
134 }
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
135 res = vec_splat(res, 3);
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
136 vec_ste(res, 0, &ires);
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
137 return ires;
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
138 }
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
139
4838
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
140 void int_init_altivec(DSPContext* c, AVCodecContext *avctx)
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
141 {
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
142 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec;
7204
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
143 c->add_int16 = add_int16_altivec;
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
144 c->sub_int16 = sub_int16_altivec;
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
145 c->scalarproduct_int16 = scalarproduct_int16_altivec;
4838
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
146 }