Mercurial > libavcodec.hg
annotate ppc/int_altivec.c @ 9003:b595a8a59967 libavcodec
Change the type of pblocks from pointers to short array into
pointers to array of 64 DCTELEM, similarly to other block fields.
This also gets rid of some casts and fixes a warning.
author | iive |
---|---|
date | Sun, 22 Feb 2009 09:02:06 +0000 |
parents | e9d9d946f213 |
children | 7cee7292d5cc |
rev | line source |
---|---|
4838
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
1 /* |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
2 * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org> |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
3 * |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
4 * This file is part of FFmpeg. |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
5 * |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
6 * FFmpeg is free software; you can redistribute it and/or |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
7 * modify it under the terms of the GNU Lesser General Public |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
8 * License as published by the Free Software Foundation; either |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
9 * version 2.1 of the License, or (at your option) any later version. |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
10 * |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
11 * FFmpeg is distributed in the hope that it will be useful, |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
14 * Lesser General Public License for more details. |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
15 * |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
16 * You should have received a copy of the GNU Lesser General Public |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
17 * License along with FFmpeg; if not, write to the Free Software |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
19 */ |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
20 |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
21 /** |
8718
e9d9d946f213
Use full internal pathname in doxygen @file directives.
diego
parents:
8494
diff
changeset
|
22 ** @file libavcodec/ppc/int_altivec.c |
4838
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
23 ** integer misc ops. |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
24 **/ |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
25 |
6763 | 26 #include "libavcodec/dsputil.h" |
4838
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
27 |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
28 #include "gcc_fixes.h" |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
29 |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
30 #include "dsputil_altivec.h" |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
31 |
7204 | 32 #include "types_altivec.h" |
33 | |
5255 | 34 static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2, |
35 int size) { | |
4838
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
36 int i, size16; |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
37 vector signed char vpix1; |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
38 vector signed short vpix2, vdiff, vpix1l,vpix1h; |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
39 union { vector signed int vscore; |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
40 int32_t score[4]; |
7333
a8a79f5385f6
cosmetics: Reformat PPC code in libavcodec according to style guidelines.
diego
parents:
7204
diff
changeset
|
41 } u; |
4838
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
42 u.vscore = vec_splat_s32(0); |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
43 // |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
44 //XXX lazy way, fix it later |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
45 |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
46 #define vec_unaligned_load(b) \ |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
47 vec_perm(vec_ld(0,b),vec_ld(15,b),vec_lvsl(0, b)); |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
48 |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
49 size16 = size >> 4; |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
50 while(size16) { |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
51 // score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]); |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
52 //load pix1 and the first batch of pix2 |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
53 |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
54 vpix1 = vec_unaligned_load(pix1); |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
55 vpix2 = vec_unaligned_load(pix2); |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
56 pix2 += 8; |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
57 //unpack |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
58 vpix1h = vec_unpackh(vpix1); |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
59 vdiff = vec_sub(vpix1h, vpix2); |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
60 vpix1l = vec_unpackl(vpix1); |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
61 // load another batch from pix2 |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
62 vpix2 = vec_unaligned_load(pix2); |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
63 u.vscore = vec_msum(vdiff, vdiff, u.vscore); |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
64 vdiff = vec_sub(vpix1l, vpix2); |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
65 u.vscore = vec_msum(vdiff, vdiff, u.vscore); |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
66 pix1 += 16; |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
67 pix2 += 8; |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
68 size16--; |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
69 } |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
70 u.vscore = vec_sums(u.vscore, vec_splat_s32(0)); |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
71 |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
72 size %= 16; |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
73 for (i = 0; i < size; i++) { |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
74 u.score[3] += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]); |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
75 } |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
76 return u.score[3]; |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
77 } |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
78 |
7204 | 79 static void add_int16_altivec(int16_t * v1, int16_t * v2, int order) |
80 { | |
81 int i; | |
8494 | 82 register vec_s16 vec, *pv; |
7204 | 83 |
84 for(i = 0; i < order; i += 8){ | |
8494 | 85 pv = (vec_s16*)v2; |
7204 | 86 vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2)); |
87 vec_st(vec_add(vec_ld(0, v1), vec), 0, v1); | |
88 v1 += 8; | |
89 v2 += 8; | |
90 } | |
91 } | |
92 | |
93 static void sub_int16_altivec(int16_t * v1, int16_t * v2, int order) | |
94 { | |
95 int i; | |
8494 | 96 register vec_s16 vec, *pv; |
7204 | 97 |
98 for(i = 0; i < order; i += 8){ | |
8494 | 99 pv = (vec_s16*)v2; |
7204 | 100 vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2)); |
101 vec_st(vec_sub(vec_ld(0, v1), vec), 0, v1); | |
102 v1 += 8; | |
103 v2 += 8; | |
104 } | |
105 } | |
106 | |
107 static int32_t scalarproduct_int16_altivec(int16_t * v1, int16_t * v2, int order, const int shift) | |
108 { | |
109 int i; | |
110 LOAD_ZERO; | |
8494 | 111 register vec_s16 vec1, *pv; |
112 register vec_s32 res = vec_splat_s32(0), t; | |
113 register vec_u32 shifts; | |
7204 | 114 DECLARE_ALIGNED_16(int32_t, ires); |
115 | |
116 shifts = zero_u32v; | |
117 if(shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1))); | |
118 if(shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08)); | |
119 if(shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04)); | |
120 if(shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02)); | |
121 if(shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01)); | |
122 | |
123 for(i = 0; i < order; i += 8){ | |
8494 | 124 pv = (vec_s16*)v1; |
7204 | 125 vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1)); |
126 t = vec_msum(vec1, vec_ld(0, v2), zero_s32v); | |
127 t = vec_sr(t, shifts); | |
128 res = vec_sums(t, res); | |
129 v1 += 8; | |
130 v2 += 8; | |
131 } | |
132 res = vec_splat(res, 3); | |
133 vec_ste(res, 0, &ires); | |
134 return ires; | |
135 } | |
136 | |
4838
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
137 void int_init_altivec(DSPContext* c, AVCodecContext *avctx) |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
138 { |
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
139 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec; |
7204 | 140 c->add_int16 = add_int16_altivec; |
141 c->sub_int16 = sub_int16_altivec; | |
142 c->scalarproduct_int16 = scalarproduct_int16_altivec; | |
4838
eeac11145c4e
ssd_int8_vs_int16_altivec, not completely benchmarkedwith svq1
lu_zero
parents:
diff
changeset
|
143 } |