annotate ppc/int_altivec.c @ 12530:63edd10ad4bc libavcodec tip

Try to fix crashes introduced by r25218. r25218 made assumptions about the existence of past reference frames that weren't necessarily true.
author darkshikari
date Tue, 28 Sep 2010 09:06:22 +0000
parents d3551fcf4c1c
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4838
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
1 /*
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
2 * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
3 *
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
4 * This file is part of FFmpeg.
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
5 *
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
6 * FFmpeg is free software; you can redistribute it and/or
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
7 * modify it under the terms of the GNU Lesser General Public
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
8 * License as published by the Free Software Foundation; either
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
9 * version 2.1 of the License, or (at your option) any later version.
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
10 *
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
11 * FFmpeg is distributed in the hope that it will be useful,
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
14 * Lesser General Public License for more details.
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
15 *
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
16 * You should have received a copy of the GNU Lesser General Public
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
17 * License along with FFmpeg; if not, write to the Free Software
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
19 */
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
20
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
21 /**
11644
7dd2a45249a9 Remove explicit filename from Doxygen @file commands.
diego
parents: 10644
diff changeset
22 ** @file
4838
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
23 ** integer misc ops.
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
24 **/
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
25
9364
7cee7292d5cc Remove unnecessary gcc_fixes.h #include.
diego
parents: 8718
diff changeset
26 #include "config.h"
7cee7292d5cc Remove unnecessary gcc_fixes.h #include.
diego
parents: 8718
diff changeset
27 #if HAVE_ALTIVEC_H
7cee7292d5cc Remove unnecessary gcc_fixes.h #include.
diego
parents: 8718
diff changeset
28 #include <altivec.h>
7cee7292d5cc Remove unnecessary gcc_fixes.h #include.
diego
parents: 8718
diff changeset
29 #endif
4838
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
30
9364
7cee7292d5cc Remove unnecessary gcc_fixes.h #include.
diego
parents: 8718
diff changeset
31 #include "libavcodec/dsputil.h"
4838
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
32
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
33 #include "dsputil_altivec.h"
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
34
7204
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
35 #include "types_altivec.h"
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
36
5255
669a97223dc7 make arguments to ssd_int8_vs_int16() const
mru
parents: 5010
diff changeset
/**
 * Sum of squared differences between an int8 and an int16 sequence.
 *
 * Computes the sum over i in [0, size) of (pix1[i] - pix2[i])^2. Full groups
 * of 16 elements are processed with AltiVec; the remaining size % 16
 * elements are handled by a scalar tail loop.
 *
 * @param pix1 first operand, signed 8-bit values (any alignment)
 * @param pix2 second operand, signed 16-bit values (any alignment)
 * @param size number of elements to compare
 * @return the accumulated squared difference
 */
static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
                                     int size) {
    int i, size16;
    vector signed char vpix1;
    vector signed short vpix2, vdiff, vpix1l, vpix1h;
    /* The union gives scalar access to the lanes of the vector accumulator. */
    union { vector signed int vscore;
            int32_t score[4];
          } u;
    u.vscore = vec_splat_s32(0);

    //XXX lazy way, fix it later

/* Load 16 bytes from a possibly unaligned address: fetch the two aligned
 * vectors covering bytes [b, b + 15] and merge them with the lvsl permute
 * mask. The expansion is a plain expression (no trailing ';') and the
 * argument is parenthesized so it is safe with any pointer expression. */
#define vec_unaligned_load(b) \
    vec_perm(vec_ld(0, (b)), vec_ld(15, (b)), vec_lvsl(0, (b)))

    size16 = size >> 4;
    while (size16) {
        // score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
        // load pix1 and the first batch of pix2
        vpix1 = vec_unaligned_load(pix1);
        vpix2 = vec_unaligned_load(pix2);
        pix2 += 8;
        // widen the first eight int8 values to int16 and subtract
        vpix1h = vec_unpackh(vpix1);
        vdiff  = vec_sub(vpix1h, vpix2);
        vpix1l = vec_unpackl(vpix1);
        // load another batch from pix2
        vpix2 = vec_unaligned_load(pix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        vdiff    = vec_sub(vpix1l, vpix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        pix1 += 16;
        pix2 += 8;
        size16--;
    }
    /* Fold the four partial sums; the total ends up in element 3, which is
     * the lane the scalar tail below keeps accumulating into. */
    u.vscore = vec_sums(u.vscore, vec_splat_s32(0));

    size %= 16;
    for (i = 0; i < size; i++) {
        u.score[3] += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    }
    return u.score[3];
}
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
81
11981
d3551fcf4c1c Add const to some pointer parameters.
cehoyos
parents: 11644
diff changeset
/**
 * Dot product of two int16 sequences with a right shift applied to the
 * partial sums.
 *
 * Elements are consumed eight at a time: vec_msum multiplies the two
 * vectors and adds adjacent products into four 32-bit lanes, each lane is
 * shifted right by shift, and vec_sums folds the lanes into the running
 * total kept in element 3 of the accumulator.
 *
 * @param v1    first operand; may be unaligned
 * @param v2    second operand; read with vec_ld(0, v2), which ignores the
 *              low address bits, so it has to be 16-byte aligned
 * @param order number of elements; the loop advances in steps of 8
 * @param shift right shift applied to each partial sum; only bits 0-4 are
 *              examined
 * @return the accumulated dot product
 */
static int32_t scalarproduct_int16_altivec(const int16_t * v1, const int16_t * v2, int order, const int shift)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1;
    register vec_s32 res = vec_splat_s32(0), t;
    register vec_u32 shifts;
    int32_t ires;

    /* Build the per-lane shift count bit by bit; 16 is formed as 8 << 1
     * rather than being splatted directly. */
    shifts = zero_u32v;
    if(shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1)));
    if(shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08));
    if(shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04));
    if(shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02));
    if(shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01));

    for(i = 0; i < order; i += 8){
        /* Unaligned load of v1. The second fetch uses offset 15 instead of
         * the next whole vector, so when v1 is already aligned no memory
         * beyond the eight elements in use is touched. */
        vec1 = vec_perm(vec_ld(0, v1), vec_ld(15, v1), vec_lvsl(0, v1));
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        /* The partial sums are signed: use the algebraic shift so negative
         * values keep their sign (vec_sr would shift in zeros). */
        t = vec_sra(t, shifts);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    /* The total lives in element 3; broadcast it so that vec_ste stores it
     * whichever lane the address of ires selects. */
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
111
11981
d3551fcf4c1c Add const to some pointer parameters.
cehoyos
parents: 11644
diff changeset
/**
 * Dot product of v1 and v2 fused with an in-place multiply-add on v1.
 *
 * Each loop pass handles 16 elements: it accumulates v1[k] * v2[k] (using
 * the value of v1 before it is updated) and then stores
 * v1[k] + mul * v3[k] back into v1.
 *
 * @param v1    read and updated in place through direct vector accesses
 * @param v2    multiplied with v1 for the returned sum; realigned with
 *              vec_perm
 * @param v3    scaled by mul and added to v1; realigned with vec_perm
 * @param order element count; order >> 4 passes are executed by a do/while
 *              loop, so at least one pass always runs
 * @param mul   scale factor, replicated into all eight 16-bit lanes
 * @return the accumulated dot product of the original v1 with v2
 *
 * NOTE(review): the permute mask is derived from v2 only and reused for v3,
 * which presumes both share the same misalignment - confirm with callers.
 */
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *dst  = (vec_s16*)v1;
    vec_s16 *src2 = (vec_s16*)v2;
    vec_s16 *src3 = (vec_s16*)v3;
    register vec_s16 scale = {mul,mul,mul,mul,mul,mul,mul,mul};
    register vec_s16 lo, hi, cur0, cur1;
    /* Vectors carried over from one pass to the next: the third vector
     * fetched in a pass is the first one needed by the following pass. */
    register vec_s16 carry2 = src2[0], carry3 = src3[0];
    register vec_s32 acc = zero_s32v;
    register vec_u8 align = vec_lvsl(0, v2);
    int32_t ires;

    order >>= 4;
    do {
        /* Realign sixteen v2 elements from three consecutive vectors. */
        lo     = vec_perm(carry2, src2[1], align);
        carry2 = src2[2];
        hi     = vec_perm(src2[1], carry2, align);

        /* Dot-product contribution, taken before v1 is overwritten. */
        cur0 = dst[0];
        cur1 = dst[1];
        acc  = vec_msum(lo, cur0, acc);
        acc  = vec_msum(hi, cur1, acc);

        /* Realign sixteen v3 elements the same way. */
        lo     = vec_perm(carry3, src3[1], align);
        carry3 = src3[2];
        hi     = vec_perm(src3[1], carry3, align);

        /* v1 += mul * v3 */
        dst[0] = vec_mladd(lo, scale, cur0);
        dst[1] = vec_mladd(hi, scale, cur1);

        dst  += 2;
        src2 += 2;
        src3 += 2;
    } while(--order);

    /* Fold the four lanes into element 3, then broadcast it so vec_ste
     * stores the total whichever lane the address of ires selects. */
    acc = vec_sums(acc, zero_s32v);
    acc = vec_splat(acc, 3);
    vec_ste(acc, 0, &ires);
    return ires;
}
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10082
diff changeset
146
4838
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
147 void int_init_altivec(DSPContext* c, AVCodecContext *avctx)
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
148 {
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
149 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec;
7204
83ffe1bed06e Altivec implementation of APE vector functions
kostya
parents: 6763
diff changeset
150 c->scalarproduct_int16 = scalarproduct_int16_altivec;
10644
5da7180afadf refactor and optimize scalarproduct
lorenm
parents: 10082
diff changeset
151 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
4838
eeac11145c4e ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff changeset
152 }