Mercurial > libavcodec.hg
annotate ppc/dsputil_altivec.c @ 1352:e8ff4783f188 libavcodec
1) remove TBL support in PPC performance. It's much more useful to use the
PMCs, and with Apple's CHUD it's fairly easy too. No reason to keep useless
code around
2) make the PPC perf stuff a configure option
3) make put_pixels16_altivec a bit faster by unrolling the loop by 4
patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
author | michaelni |
---|---|
date | Wed, 09 Jul 2003 20:18:13 +0000 |
parents | 09b8fe0f0139 |
children | dea5b2946999 |
rev | line source |
---|---|
828
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
1 /* |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
2 * Copyright (c) 2002 Brian Foley |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
3 * Copyright (c) 2002 Dieter Shirley |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
4 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> |
828
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
5 * |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
6 * This library is free software; you can redistribute it and/or |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
7 * modify it under the terms of the GNU Lesser General Public |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
8 * License as published by the Free Software Foundation; either |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
9 * version 2 of the License, or (at your option) any later version. |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
10 * |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
11 * This library is distributed in the hope that it will be useful, |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
14 * Lesser General Public License for more details. |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
15 * |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
16 * You should have received a copy of the GNU Lesser General Public |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
17 * License along with this library; if not, write to the Free Software |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
19 */ |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
20 |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
21 #include "../dsputil.h" |
1277
f3152eb76f1a
altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents:
1064
diff
changeset
|
22 |
f3152eb76f1a
altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents:
1064
diff
changeset
|
23 #include "gcc_fixes.h" |
f3152eb76f1a
altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents:
1064
diff
changeset
|
24 |
828
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
25 #include "dsputil_altivec.h" |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
26 |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
27 #ifdef CONFIG_DARWIN |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
28 #include <sys/sysctl.h> |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
29 #else /* CONFIG_DARWIN */ |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
30 #include <signal.h> |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
31 #include <setjmp.h> |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
32 |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
33 static sigjmp_buf jmpbuf; |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
34 static volatile sig_atomic_t canjump = 0; |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
35 |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
36 static void sigill_handler (int sig) |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
37 { |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
38 if (!canjump) { |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
39 signal (sig, SIG_DFL); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
40 raise (sig); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
41 } |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
42 |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
43 canjump = 0; |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
44 siglongjmp (jmpbuf, 1); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
45 } |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
46 #endif /* CONFIG_DARWIN */ |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
47 |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
48 int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
49 { |
981 | 50 int i; |
51 int s __attribute__((aligned(16))); | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
52 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); |
981 | 53 vector unsigned char *tv; |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
54 vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
55 vector unsigned int sad; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
56 vector signed int sumdiffs; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
57 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
58 s = 0; |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
59 sad = (vector unsigned int)vec_splat_u32(0); |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
60 for(i=0;i<16;i++) { |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
61 /* |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
62 Read unaligned pixels into our vectors. The vectors are as follows: |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
63 pix1v: pix1[0]-pix1[15] |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
64 pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
65 */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
66 tv = (vector unsigned char *) pix1; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
67 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
68 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
69 tv = (vector unsigned char *) &pix2[0]; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
70 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
71 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
72 tv = (vector unsigned char *) &pix2[1]; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
73 pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
74 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
75 /* Calculate the average vector */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
76 avgv = vec_avg(pix2v, pix2iv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
77 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
78 /* Calculate a sum of abs differences vector */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
79 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
80 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
81 /* Add each 4 pixel group together and put 4 results into sad */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
82 sad = vec_sum4s(t5, sad); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
83 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
84 pix1 += line_size; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
85 pix2 += line_size; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
86 } |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
87 /* Sum up the four partial sums, and put the result into s */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
88 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
89 sumdiffs = vec_splat(sumdiffs, 3); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
90 vec_ste(sumdiffs, 0, &s); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
91 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
92 return s; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
93 } |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
94 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
95 int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
96 { |
981 | 97 int i; |
98 int s __attribute__((aligned(16))); | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
99 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); |
981 | 100 vector unsigned char *tv; |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
101 vector unsigned char pix1v, pix2v, pix3v, avgv, t5; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
102 vector unsigned int sad; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
103 vector signed int sumdiffs; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
104 uint8_t *pix3 = pix2 + line_size; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
105 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
106 s = 0; |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
107 sad = (vector unsigned int)vec_splat_u32(0); |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
108 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
109 /* |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
110 Due to the fact that pix3 = pix2 + line_size, the pix3 of one |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
111 iteration becomes pix2 in the next iteration. We can use this |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
112 fact to avoid a potentially expensive unaligned read, each |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
113 time around the loop. |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
114 Read unaligned pixels into our vectors. The vectors are as follows: |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
115 pix2v: pix2[0]-pix2[15] |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
116 Split the pixel vectors into shorts |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
117 */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
118 tv = (vector unsigned char *) &pix2[0]; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
119 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
120 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
121 for(i=0;i<16;i++) { |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
122 /* |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
123 Read unaligned pixels into our vectors. The vectors are as follows: |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
124 pix1v: pix1[0]-pix1[15] |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
125 pix3v: pix3[0]-pix3[15] |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
126 */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
127 tv = (vector unsigned char *) pix1; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
128 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
129 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
130 tv = (vector unsigned char *) &pix3[0]; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
131 pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
132 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
133 /* Calculate the average vector */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
134 avgv = vec_avg(pix2v, pix3v); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
135 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
136 /* Calculate a sum of abs differences vector */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
137 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
138 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
139 /* Add each 4 pixel group together and put 4 results into sad */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
140 sad = vec_sum4s(t5, sad); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
141 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
142 pix1 += line_size; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
143 pix2v = pix3v; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
144 pix3 += line_size; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
145 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
146 } |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
147 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
148 /* Sum up the four partial sums, and put the result into s */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
149 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
150 sumdiffs = vec_splat(sumdiffs, 3); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
151 vec_ste(sumdiffs, 0, &s); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
152 return s; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
153 } |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
154 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
155 int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
156 { |
981 | 157 int i; |
158 int s __attribute__((aligned(16))); | |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
159 uint8_t *pix3 = pix2 + line_size; |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
160 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
161 const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2); |
981 | 162 vector unsigned char *tv, avgv, t5; |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
163 vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
164 vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
165 vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; |
981 | 166 vector unsigned short avghv, avglv; |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
167 vector unsigned short t1, t2, t3, t4; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
168 vector unsigned int sad; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
169 vector signed int sumdiffs; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
170 |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
171 sad = (vector unsigned int)vec_splat_u32(0); |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
172 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
173 s = 0; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
174 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
175 /* |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
176 Due to the fact that pix3 = pix2 + line_size, the pix3 of one |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
177 iteration becomes pix2 in the next iteration. We can use this |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
178 fact to avoid a potentially expensive unaligned read, as well |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
179 as some splitting, and vector addition each time around the loop. |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
180 Read unaligned pixels into our vectors. The vectors are as follows: |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
181 pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
182 Split the pixel vectors into shorts |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
183 */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
184 tv = (vector unsigned char *) &pix2[0]; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
185 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
186 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
187 tv = (vector unsigned char *) &pix2[1]; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
188 pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
189 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
190 pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
191 pix2lv = (vector unsigned short) vec_mergel(zero, pix2v); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
192 pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
193 pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
194 t1 = vec_add(pix2hv, pix2ihv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
195 t2 = vec_add(pix2lv, pix2ilv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
196 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
197 for(i=0;i<16;i++) { |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
198 /* |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
199 Read unaligned pixels into our vectors. The vectors are as follows: |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
200 pix1v: pix1[0]-pix1[15] |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
201 pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16] |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
202 */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
203 tv = (vector unsigned char *) pix1; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
204 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
205 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
206 tv = (vector unsigned char *) &pix3[0]; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
207 pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
208 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
209 tv = (vector unsigned char *) &pix3[1]; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
210 pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1])); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
211 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
212 /* |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
213 Note that Altivec does have vec_avg, but this works on vector pairs |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
214 and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
215 would mean that, for example, avg(3,0,0,1) = 2, when it should be 1. |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
216 Instead, we have to split the pixel vectors into vectors of shorts, |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
217 and do the averaging by hand. |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
218 */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
219 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
220 /* Split the pixel vectors into shorts */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
221 pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
222 pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
223 pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
224 pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
225 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
226 /* Do the averaging on them */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
227 t3 = vec_add(pix3hv, pix3ihv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
228 t4 = vec_add(pix3lv, pix3ilv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
229 |
884 | 230 avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); |
231 avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); | |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
232 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
233 /* Pack the shorts back into a result */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
234 avgv = vec_pack(avghv, avglv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
235 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
236 /* Calculate a sum of abs differences vector */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
237 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
238 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
239 /* Add each 4 pixel group together and put 4 results into sad */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
240 sad = vec_sum4s(t5, sad); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
241 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
242 pix1 += line_size; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
243 pix3 += line_size; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
244 /* Transfer the calculated values for pix3 into pix2 */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
245 t1 = t3; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
246 t2 = t4; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
247 } |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
248 /* Sum up the four partial sums, and put the result into s */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
249 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
250 sumdiffs = vec_splat(sumdiffs, 3); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
251 vec_ste(sumdiffs, 0, &s); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
252 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
253 return s; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
254 } |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
255 |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
256 int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
257 { |
981 | 258 int i; |
259 int s __attribute__((aligned(16))); | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
260 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
261 vector unsigned char perm1, perm2, *pix1v, *pix2v; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
262 vector unsigned char t1, t2, t3,t4, t5; |
981 | 263 vector unsigned int sad; |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
264 vector signed int sumdiffs; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
265 |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
266 sad = (vector unsigned int)vec_splat_u32(0); |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
267 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
268 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
269 for(i=0;i<16;i++) { |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
270 /* Read potentially unaligned pixels into t1 and t2 */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
271 perm1 = vec_lvsl(0, pix1); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
272 pix1v = (vector unsigned char *) pix1; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
273 perm2 = vec_lvsl(0, pix2); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
274 pix2v = (vector unsigned char *) pix2; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
275 t1 = vec_perm(pix1v[0], pix1v[1], perm1); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
276 t2 = vec_perm(pix2v[0], pix2v[1], perm2); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
277 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
278 /* Calculate a sum of abs differences vector */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
279 t3 = vec_max(t1, t2); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
280 t4 = vec_min(t1, t2); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
281 t5 = vec_sub(t3, t4); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
282 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
283 /* Add each 4 pixel group together and put 4 results into sad */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
284 sad = vec_sum4s(t5, sad); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
285 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
286 pix1 += line_size; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
287 pix2 += line_size; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
288 } |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
289 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
290 /* Sum up the four partial sums, and put the result into s */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
291 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
292 sumdiffs = vec_splat(sumdiffs, 3); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
293 vec_ste(sumdiffs, 0, &s); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
294 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
295 return s; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
296 } |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
297 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
298 int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
299 { |
981 | 300 int i; |
301 int s __attribute__((aligned(16))); | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
302 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
303 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
304 vector unsigned char t1, t2, t3,t4, t5; |
981 | 305 vector unsigned int sad; |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
306 vector signed int sumdiffs; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
307 |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
308 sad = (vector unsigned int)vec_splat_u32(0); |
1277
f3152eb76f1a
altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents:
1064
diff
changeset
|
309 |
f3152eb76f1a
altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents:
1064
diff
changeset
|
310 permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
311 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
312 for(i=0;i<8;i++) { |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
313 /* Read potentially unaligned pixels into t1 and t2 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
314 Since we're reading 16 pixels, and actually only want 8, |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
315 mask out the last 8 pixels. The 0s don't change the sum. */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
316 perm1 = vec_lvsl(0, pix1); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
317 pix1v = (vector unsigned char *) pix1; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
318 perm2 = vec_lvsl(0, pix2); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
319 pix2v = (vector unsigned char *) pix2; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
320 t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
321 t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
322 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
323 /* Calculate a sum of abs differences vector */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
324 t3 = vec_max(t1, t2); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
325 t4 = vec_min(t1, t2); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
326 t5 = vec_sub(t3, t4); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
327 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
328 /* Add each 4 pixel group together and put 4 results into sad */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
329 sad = vec_sum4s(t5, sad); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
330 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
331 pix1 += line_size; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
332 pix2 += line_size; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
333 } |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
334 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
335 /* Sum up the four partial sums, and put the result into s */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
336 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
337 sumdiffs = vec_splat(sumdiffs, 3); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
338 vec_ste(sumdiffs, 0, &s); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
339 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
340 return s; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
341 } |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
342 |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
343 int pix_norm1_altivec(uint8_t *pix, int line_size) |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
344 { |
981 | 345 int i; |
346 int s __attribute__((aligned(16))); | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
347 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); |
981 | 348 vector unsigned char *tv; |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
349 vector unsigned char pixv; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
350 vector unsigned int sv; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
351 vector signed int sum; |
981 | 352 |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
353 sv = (vector unsigned int)vec_splat_u32(0); |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
354 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
355 s = 0; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
356 for (i = 0; i < 16; i++) { |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
357 /* Read in the potentially unaligned pixels */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
358 tv = (vector unsigned char *) pix; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
359 pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix)); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
360 |
884 | 361 /* Square the values, and add them to our sum */ |
362 sv = vec_msum(pixv, pixv, sv); | |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
363 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
364 pix += line_size; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
365 } |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
366 /* Sum up the four partial sums, and put the result into s */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
367 sum = vec_sums((vector signed int) sv, (vector signed int) zero); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
368 sum = vec_splat(sum, 3); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
369 vec_ste(sum, 0, &s); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
370 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
371 return s; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
372 } |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
373 |
981 | 374 /** |
375 * Sum of Squared Errors for a 8x8 block. | |
376 * AltiVec-enhanced. | |
377 * It's the pix_abs8x8_altivec code above w/ squaring added. | |
378 */ | |
379 int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) | |
380 { | |
381 int i; | |
382 int s __attribute__((aligned(16))); | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
383 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); |
981 | 384 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; |
385 vector unsigned char t1, t2, t3,t4, t5; | |
386 vector unsigned int sum; | |
387 vector signed int sumsqr; | |
388 | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
389 sum = (vector unsigned int)vec_splat_u32(0); |
1277
f3152eb76f1a
altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents:
1064
diff
changeset
|
390 |
f3152eb76f1a
altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents:
1064
diff
changeset
|
391 permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); |
f3152eb76f1a
altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents:
1064
diff
changeset
|
392 |
981 | 393 |
394 for(i=0;i<8;i++) { | |
395 /* Read potentially unaligned pixels into t1 and t2 | |
396 Since we're reading 16 pixels, and actually only want 8, | |
397 mask out the last 8 pixels. The 0s don't change the sum. */ | |
398 perm1 = vec_lvsl(0, pix1); | |
399 pix1v = (vector unsigned char *) pix1; | |
400 perm2 = vec_lvsl(0, pix2); | |
401 pix2v = (vector unsigned char *) pix2; | |
402 t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); | |
403 t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); | |
404 | |
405 /* | |
406 Since we want to use unsigned chars, we can take advantage | |
407 of the fact that abs(a-b)^2 = (a-b)^2. | |
408 */ | |
409 | |
410 /* Calculate abs differences vector */ | |
411 t3 = vec_max(t1, t2); | |
412 t4 = vec_min(t1, t2); | |
413 t5 = vec_sub(t3, t4); | |
414 | |
415 /* Square the values and add them to our sum */ | |
416 sum = vec_msum(t5, t5, sum); | |
417 | |
418 pix1 += line_size; | |
419 pix2 += line_size; | |
420 } | |
421 | |
422 /* Sum up the four partial sums, and put the result into s */ | |
423 sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); | |
424 sumsqr = vec_splat(sumsqr, 3); | |
425 vec_ste(sumsqr, 0, &s); | |
426 | |
427 return s; | |
428 } | |
429 | |
430 /** | |
431 * Sum of Squared Errors for a 16x16 block. | |
432 * AltiVec-enhanced. | |
433 * It's the pix_abs16x16_altivec code above w/ squaring added. | |
434 */ | |
435 int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) | |
436 { | |
437 int i; | |
438 int s __attribute__((aligned(16))); | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
439 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); |
981 | 440 vector unsigned char perm1, perm2, *pix1v, *pix2v; |
441 vector unsigned char t1, t2, t3,t4, t5; | |
442 vector unsigned int sum; | |
443 vector signed int sumsqr; | |
444 | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
445 sum = (vector unsigned int)vec_splat_u32(0); |
981 | 446 |
447 for(i=0;i<16;i++) { | |
448 /* Read potentially unaligned pixels into t1 and t2 */ | |
449 perm1 = vec_lvsl(0, pix1); | |
450 pix1v = (vector unsigned char *) pix1; | |
451 perm2 = vec_lvsl(0, pix2); | |
452 pix2v = (vector unsigned char *) pix2; | |
453 t1 = vec_perm(pix1v[0], pix1v[1], perm1); | |
454 t2 = vec_perm(pix2v[0], pix2v[1], perm2); | |
455 | |
456 /* | |
457 Since we want to use unsigned chars, we can take advantage | |
458 of the fact that abs(a-b)^2 = (a-b)^2. | |
459 */ | |
460 | |
461 /* Calculate abs differences vector */ | |
462 t3 = vec_max(t1, t2); | |
463 t4 = vec_min(t1, t2); | |
464 t5 = vec_sub(t3, t4); | |
465 | |
466 /* Square the values and add them to our sum */ | |
467 sum = vec_msum(t5, t5, sum); | |
468 | |
469 pix1 += line_size; | |
470 pix2 += line_size; | |
471 } | |
472 | |
473 /* Sum up the four partial sums, and put the result into s */ | |
474 sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); | |
475 sumsqr = vec_splat(sumsqr, 3); | |
476 vec_ste(sumsqr, 0, &s); | |
477 | |
478 return s; | |
479 } | |
480 | |
1064 | 481 int pix_sum_altivec(uint8_t * pix, int line_size) |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
482 { |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
483 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
484 vector unsigned char perm, *pixv; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
485 vector unsigned char t1; |
981 | 486 vector unsigned int sad; |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
487 vector signed int sumdiffs; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
488 |
981 | 489 int i; |
490 int s __attribute__((aligned(16))); | |
491 | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
492 sad = (vector unsigned int)vec_splat_u32(0); |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
493 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
494 for (i = 0; i < 16; i++) { |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
495 /* Read the potentially unaligned 16 pixels into t1 */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
496 perm = vec_lvsl(0, pix); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
497 pixv = (vector unsigned char *) pix; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
498 t1 = vec_perm(pixv[0], pixv[1], perm); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
499 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
500 /* Add each 4 pixel group together and put 4 results into sad */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
501 sad = vec_sum4s(t1, sad); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
502 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
503 pix += line_size; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
504 } |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
505 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
506 /* Sum up the four partial sums, and put the result into s */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
507 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
508 sumdiffs = vec_splat(sumdiffs, 3); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
509 vec_ste(sumdiffs, 0, &s); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
510 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
511 return s; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
512 } |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
513 |
1064 | 514 void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size) |
828
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
515 { |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
516 int i; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
517 vector unsigned char perm, bytes, *pixv; |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
518 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); |
828
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
519 vector signed short shorts; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
520 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
521 for(i=0;i<8;i++) |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
522 { |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
523 // Read potentially unaligned pixels. |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
524 // We're reading 16 pixels, and actually only want 8, |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
525 // but we simply ignore the extras. |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
526 perm = vec_lvsl(0, pixels); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
527 pixv = (vector unsigned char *) pixels; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
528 bytes = vec_perm(pixv[0], pixv[1], perm); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
529 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
530 // convert the bytes into shorts |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
531 shorts = (vector signed short)vec_mergeh(zero, bytes); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
532 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
533 // save the data to the block, we assume the block is 16-byte aligned |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
534 vec_st(shorts, i*16, (vector signed short*)block); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
535 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
536 pixels += line_size; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
537 } |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
538 } |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
539 |
1064 | 540 void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1, |
541 const uint8_t *s2, int stride) | |
828
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
542 { |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
543 int i; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
544 vector unsigned char perm, bytes, *pixv; |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
545 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); |
828
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
546 vector signed short shorts1, shorts2; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
547 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
548 for(i=0;i<4;i++) |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
549 { |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
550 // Read potentially unaligned pixels |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
551 // We're reading 16 pixels, and actually only want 8, |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
552 // but we simply ignore the extras. |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
553 perm = vec_lvsl(0, s1); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
554 pixv = (vector unsigned char *) s1; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
555 bytes = vec_perm(pixv[0], pixv[1], perm); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
556 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
557 // convert the bytes into shorts |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
558 shorts1 = (vector signed short)vec_mergeh(zero, bytes); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
559 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
560 // Do the same for the second block of pixels |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
561 perm = vec_lvsl(0, s2); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
562 pixv = (vector unsigned char *) s2; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
563 bytes = vec_perm(pixv[0], pixv[1], perm); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
564 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
565 // convert the bytes into shorts |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
566 shorts2 = (vector signed short)vec_mergeh(zero, bytes); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
567 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
568 // Do the subtraction |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
569 shorts1 = vec_sub(shorts1, shorts2); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
570 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
571 // save the data to the block, we assume the block is 16-byte aligned |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
572 vec_st(shorts1, 0, (vector signed short*)block); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
573 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
574 s1 += stride; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
575 s2 += stride; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
576 block += 8; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
577 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
578 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
579 // The code below is a copy of the code above... This is a manual |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
580 // unroll. |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
581 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
582 // Read potentially unaligned pixels |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
583 // We're reading 16 pixels, and actually only want 8, |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
584 // but we simply ignore the extras. |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
585 perm = vec_lvsl(0, s1); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
586 pixv = (vector unsigned char *) s1; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
587 bytes = vec_perm(pixv[0], pixv[1], perm); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
588 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
589 // convert the bytes into shorts |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
590 shorts1 = (vector signed short)vec_mergeh(zero, bytes); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
591 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
592 // Do the same for the second block of pixels |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
593 perm = vec_lvsl(0, s2); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
594 pixv = (vector unsigned char *) s2; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
595 bytes = vec_perm(pixv[0], pixv[1], perm); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
596 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
597 // convert the bytes into shorts |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
598 shorts2 = (vector signed short)vec_mergeh(zero, bytes); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
599 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
600 // Do the subtraction |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
601 shorts1 = vec_sub(shorts1, shorts2); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
602 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
603 // save the data to the block, we assume the block is 16-byte aligned |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
604 vec_st(shorts1, 0, (vector signed short*)block); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
605 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
606 s1 += stride; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
607 s2 += stride; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
608 block += 8; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
609 } |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
610 } |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
611 |
995
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
612 int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) { |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
613 return pix_abs16x16_altivec(a,b,stride); |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
614 } |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
615 |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
616 int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) { |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
617 return pix_abs8x8_altivec(a,b,stride); |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
618 } |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
619 |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
620 void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
621 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
995
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
622 int i; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
623 for(i=0; i+7<w; i++){ |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
624 dst[i+0] += src[i+0]; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
625 dst[i+1] += src[i+1]; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
626 dst[i+2] += src[i+2]; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
627 dst[i+3] += src[i+3]; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
628 dst[i+4] += src[i+4]; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
629 dst[i+5] += src[i+5]; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
630 dst[i+6] += src[i+6]; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
631 dst[i+7] += src[i+7]; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
632 } |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
633 for(; i<w; i++) |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
634 dst[i+0] += src[i+0]; |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
635 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
995
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
636 register int i; |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
637 register vector unsigned char vdst, vsrc; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
638 |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
639 /* dst and src are 16 bytes-aligned (guaranteed) */ |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
640 for(i = 0 ; (i + 15) < w ; i++) |
995
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
641 { |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
642 vdst = vec_ld(i << 4, (unsigned char*)dst); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
643 vsrc = vec_ld(i << 4, (unsigned char*)src); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
644 vdst = vec_add(vsrc, vdst); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
645 vec_st(vdst, i << 4, (unsigned char*)dst); |
995
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
646 } |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
647 /* if w is not a multiple of 16 */ |
995
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
648 for (; (i < w) ; i++) |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
649 { |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
650 dst[i] = src[i]; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
651 } |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
652 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
653 } |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
654 |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
655 /* next one assumes that ((line_size % 16) == 0) */ |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
656 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
657 { |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
658 POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
659 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
660 int i; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
661 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
662 POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
663 |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
664 for(i=0; i<h; i++) { |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
665 *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
666 *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
667 *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
668 *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
669 pixels+=line_size; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
670 block +=line_size; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
671 } |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
672 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
673 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
674 |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
675 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
676 register vector unsigned char pixelsv1, pixelsv2; |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
677 register vector unsigned char pixelsv1B, pixelsv2B; |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
678 register vector unsigned char pixelsv1C, pixelsv2C; |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
679 register vector unsigned char pixelsv1D, pixelsv2D; |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
680 |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
681 register vector unsigned char perm = vec_lvsl(0, pixels); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
682 int i; |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
683 register int line_size_2 = line_size << 1; |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
684 register int line_size_3 = line_size + line_size_2; |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
685 register int line_size_4 = line_size << 2; |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
686 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
687 POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
688 // hand-unrolling the loop by 4 gains about 15% |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
689 // mininum execution time goes from 74 to 60 cycles |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
690 // it's faster than -funroll-loops, but using |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
691 // -funroll-loops w/ this is bad - 74 cycles again. |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
692 // all this is on a 7450, tuning for the 7450 |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
693 #if 0 |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
694 for(i=0; i<h; i++) { |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
695 pixelsv1 = vec_ld(0, (unsigned char*)pixels); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
696 pixelsv2 = vec_ld(16, (unsigned char*)pixels); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
697 vec_st(vec_perm(pixelsv1, pixelsv2, perm), |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
698 0, (unsigned char*)block); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
699 pixels+=line_size; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
700 block +=line_size; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
701 } |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
702 #else |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
703 for(i=0; i<h; i+=4) { |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
704 pixelsv1 = vec_ld(0, (unsigned char*)pixels); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
705 pixelsv2 = vec_ld(16, (unsigned char*)pixels); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
706 pixelsv1B = vec_ld(line_size, (unsigned char*)pixels); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
707 pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
708 pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
709 pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
710 pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
711 pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
712 vec_st(vec_perm(pixelsv1, pixelsv2, perm), |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
713 0, (unsigned char*)block); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
714 vec_st(vec_perm(pixelsv1B, pixelsv2B, perm), |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
715 line_size, (unsigned char*)block); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
716 vec_st(vec_perm(pixelsv1C, pixelsv2C, perm), |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
717 line_size_2, (unsigned char*)block); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
718 vec_st(vec_perm(pixelsv1D, pixelsv2D, perm), |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
719 line_size_3, (unsigned char*)block); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
720 pixels+=line_size_4; |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
721 block +=line_size_4; |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
722 } |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
723 #endif |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
724 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
725 |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
726 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
727 } |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
728 |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
729 /* next one assumes that ((line_size % 16) == 0) */ |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
730 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
731 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
732 { |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
733 POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
734 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
735 int i; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
736 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
737 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
738 |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
739 for(i=0; i<h; i++) { |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
740 op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l)); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
741 op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l)); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
742 op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l)); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
743 op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l)); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
744 pixels+=line_size; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
745 block +=line_size; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
746 } |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
747 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
748 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
749 |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
750 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
751 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
752 register vector unsigned char perm = vec_lvsl(0, pixels); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
753 int i; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
754 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
755 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
756 |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
757 for(i=0; i<h; i++) { |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
758 pixelsv1 = vec_ld(0, (unsigned char*)pixels); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
759 pixelsv2 = vec_ld(16, (unsigned char*)pixels); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
760 blockv = vec_ld(0, block); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
761 pixelsv = vec_perm(pixelsv1, pixelsv2, perm); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
762 blockv = vec_avg(blockv,pixelsv); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
763 vec_st(blockv, 0, (unsigned char*)block); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
764 pixels+=line_size; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
765 block +=line_size; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
766 } |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
767 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
768 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
769 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
770 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
771 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
772 |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
773 /* next one assumes that ((line_size % 8) == 0) */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
774 void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
775 { |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
776 POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
777 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
778 int i; |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
779 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
780 for (i = 0; i < h; i++) { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
781 *((uint32_t *) (block)) = |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
782 (((*((uint32_t *) (block))) | |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
783 ((((const struct unaligned_32 *) (pixels))->l))) - |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
784 ((((*((uint32_t *) (block))) ^ |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
785 ((((const struct unaligned_32 *) (pixels))-> |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
786 l))) & 0xFEFEFEFEUL) >> 1)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
787 *((uint32_t *) (block + 4)) = |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
788 (((*((uint32_t *) (block + 4))) | |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
789 ((((const struct unaligned_32 *) (pixels + 4))->l))) - |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
790 ((((*((uint32_t *) (block + 4))) ^ |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
791 ((((const struct unaligned_32 *) (pixels + |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
792 4))-> |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
793 l))) & 0xFEFEFEFEUL) >> 1)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
794 pixels += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
795 block += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
796 } |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
797 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
798 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
799 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
800 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
801 int i; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
802 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
803 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
804 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
805 for (i = 0; i < h; i++) { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
806 /* |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
807 block is 8 bytes-aligned, so we're either in the |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
808 left block (16 bytes-aligned) or in the right block (not) |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
809 */ |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
810 int rightside = ((unsigned long)block & 0x0000000F); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
811 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
812 blockv = vec_ld(0, block); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
813 pixelsv1 = vec_ld(0, (unsigned char*)pixels); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
814 pixelsv2 = vec_ld(16, (unsigned char*)pixels); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
815 pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
816 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
817 if (rightside) |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
818 { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
819 pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
820 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
821 else |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
822 { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
823 pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
824 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
825 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
826 blockv = vec_avg(blockv, pixelsv); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
827 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
828 vec_st(blockv, 0, block); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
829 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
830 pixels += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
831 block += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
832 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
833 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
834 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
835 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
836 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
837 } |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
838 |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
839 /* next one assumes that ((line_size % 8) == 0) */ |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
840 void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
841 { |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
842 POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
843 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
844 int j; |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
845 POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
846 for (j = 0; j < 2; j++) { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
847 int i; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
848 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
849 const uint32_t b = |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
850 (((const struct unaligned_32 *) (pixels + 1))->l); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
851 uint32_t l0 = |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
852 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
853 uint32_t h0 = |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
854 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
855 uint32_t l1, h1; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
856 pixels += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
857 for (i = 0; i < h; i += 2) { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
858 uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
859 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
860 l1 = (a & 0x03030303UL) + (b & 0x03030303UL); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
861 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
862 *((uint32_t *) block) = |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
863 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
864 pixels += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
865 block += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
866 a = (((const struct unaligned_32 *) (pixels))->l); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
867 b = (((const struct unaligned_32 *) (pixels + 1))->l); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
868 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
869 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
870 *((uint32_t *) block) = |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
871 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
872 pixels += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
873 block += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
874 } pixels += 4 - line_size * (h + 1); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
875 block += 4 - line_size * h; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
876 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
877 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
878 POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
879 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
880 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
881 register int i; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
882 register vector unsigned char |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
883 pixelsv1, pixelsv2, |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
884 pixelsavg; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
885 register vector unsigned char |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
886 blockv, temp1, temp2; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
887 register vector unsigned short |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
888 pixelssum1, pixelssum2, temp3; |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
889 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
890 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
891 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
892 temp1 = vec_ld(0, pixels); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
893 temp2 = vec_ld(16, pixels); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
894 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
895 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
896 { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
897 pixelsv2 = temp2; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
898 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
899 else |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
900 { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
901 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
902 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
903 pixelsv1 = vec_mergeh(vczero, pixelsv1); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
904 pixelsv2 = vec_mergeh(vczero, pixelsv2); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
905 pixelssum1 = vec_add((vector unsigned short)pixelsv1, |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
906 (vector unsigned short)pixelsv2); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
907 pixelssum1 = vec_add(pixelssum1, vctwo); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
908 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
909 POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
910 for (i = 0; i < h ; i++) { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
911 int rightside = ((unsigned long)block & 0x0000000F); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
912 blockv = vec_ld(0, block); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
913 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
914 temp1 = vec_ld(line_size, pixels); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
915 temp2 = vec_ld(line_size + 16, pixels); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
916 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
917 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
918 { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
919 pixelsv2 = temp2; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
920 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
921 else |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
922 { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
923 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
924 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
925 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
926 pixelsv1 = vec_mergeh(vczero, pixelsv1); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
927 pixelsv2 = vec_mergeh(vczero, pixelsv2); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
928 pixelssum2 = vec_add((vector unsigned short)pixelsv1, |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
929 (vector unsigned short)pixelsv2); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
930 temp3 = vec_add(pixelssum1, pixelssum2); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
931 temp3 = vec_sra(temp3, vctwo); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
932 pixelssum1 = vec_add(pixelssum2, vctwo); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
933 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
934 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
935 if (rightside) |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
936 { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
937 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
938 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
939 else |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
940 { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
941 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
942 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
943 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
944 vec_st(blockv, 0, block); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
945 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
946 block += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
947 pixels += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
948 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
949 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
950 POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
951 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
995
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
952 } |
828
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
953 |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
954 /* next one assumes that ((line_size % 8) == 0) */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
955 void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
956 { |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
957 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
958 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
959 int j; |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
960 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
961 for (j = 0; j < 2; j++) { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
962 int i; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
963 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
964 const uint32_t b = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
965 (((const struct unaligned_32 *) (pixels + 1))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
966 uint32_t l0 = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
967 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
968 uint32_t h0 = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
969 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
970 uint32_t l1, h1; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
971 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
972 for (i = 0; i < h; i += 2) { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
973 uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
974 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
975 l1 = (a & 0x03030303UL) + (b & 0x03030303UL); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
976 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
977 *((uint32_t *) block) = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
978 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
979 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
980 block += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
981 a = (((const struct unaligned_32 *) (pixels))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
982 b = (((const struct unaligned_32 *) (pixels + 1))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
983 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
984 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
985 *((uint32_t *) block) = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
986 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
987 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
988 block += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
989 } pixels += 4 - line_size * (h + 1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
990 block += 4 - line_size * h; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
991 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
992 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
993 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
994 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
995 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
996 register int i; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
997 register vector unsigned char |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
998 pixelsv1, pixelsv2, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
999 pixelsavg; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1000 register vector unsigned char |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1001 blockv, temp1, temp2; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1002 register vector unsigned short |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1003 pixelssum1, pixelssum2, temp3; |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1004 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1005 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1006 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1007 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1008 temp1 = vec_ld(0, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1009 temp2 = vec_ld(16, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1010 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1011 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1012 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1013 pixelsv2 = temp2; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1014 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1015 else |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1016 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1017 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1018 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1019 pixelsv1 = vec_mergeh(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1020 pixelsv2 = vec_mergeh(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1021 pixelssum1 = vec_add((vector unsigned short)pixelsv1, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1022 (vector unsigned short)pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1023 pixelssum1 = vec_add(pixelssum1, vcone); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1024 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1025 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1026 for (i = 0; i < h ; i++) { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1027 int rightside = ((unsigned long)block & 0x0000000F); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1028 blockv = vec_ld(0, block); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1029 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1030 temp1 = vec_ld(line_size, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1031 temp2 = vec_ld(line_size + 16, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1032 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1033 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1034 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1035 pixelsv2 = temp2; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1036 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1037 else |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1038 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1039 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1040 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1041 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1042 pixelsv1 = vec_mergeh(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1043 pixelsv2 = vec_mergeh(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1044 pixelssum2 = vec_add((vector unsigned short)pixelsv1, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1045 (vector unsigned short)pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1046 temp3 = vec_add(pixelssum1, pixelssum2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1047 temp3 = vec_sra(temp3, vctwo); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1048 pixelssum1 = vec_add(pixelssum2, vcone); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1049 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1050 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1051 if (rightside) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1052 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1053 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1054 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1055 else |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1056 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1057 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1058 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1059 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1060 vec_st(blockv, 0, block); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1061 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1062 block += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1063 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1064 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1065 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1066 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1067 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1068 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1069 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1070 /* next one assumes that ((line_size % 16) == 0) */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1071 void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1072 { |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1073 POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1074 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1075 int j; |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1076 POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1077 for (j = 0; j < 4; j++) { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1078 int i; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1079 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1080 const uint32_t b = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1081 (((const struct unaligned_32 *) (pixels + 1))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1082 uint32_t l0 = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1083 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1084 uint32_t h0 = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1085 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1086 uint32_t l1, h1; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1087 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1088 for (i = 0; i < h; i += 2) { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1089 uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1090 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1091 l1 = (a & 0x03030303UL) + (b & 0x03030303UL); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1092 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1093 *((uint32_t *) block) = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1094 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1095 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1096 block += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1097 a = (((const struct unaligned_32 *) (pixels))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1098 b = (((const struct unaligned_32 *) (pixels + 1))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1099 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1100 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1101 *((uint32_t *) block) = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1102 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1103 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1104 block += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1105 } pixels += 4 - line_size * (h + 1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1106 block += 4 - line_size * h; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1107 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1108 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1109 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1110 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1111 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1112 register int i; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1113 register vector unsigned char |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1114 pixelsv1, pixelsv2, pixelsv3, pixelsv4; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1115 register vector unsigned char |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1116 blockv, temp1, temp2; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1117 register vector unsigned short |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1118 pixelssum1, pixelssum2, temp3, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1119 pixelssum3, pixelssum4, temp4; |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1120 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1121 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); |
1340
09b8fe0f0139
PPC fixes & clean-up patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1277
diff
changeset
|
1122 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1123 POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); |
1340
09b8fe0f0139
PPC fixes & clean-up patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1277
diff
changeset
|
1124 |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1125 temp1 = vec_ld(0, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1126 temp2 = vec_ld(16, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1127 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1128 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1129 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1130 pixelsv2 = temp2; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1131 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1132 else |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1133 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1134 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1135 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1136 pixelsv3 = vec_mergel(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1137 pixelsv4 = vec_mergel(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1138 pixelsv1 = vec_mergeh(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1139 pixelsv2 = vec_mergeh(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1140 pixelssum3 = vec_add((vector unsigned short)pixelsv3, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1141 (vector unsigned short)pixelsv4); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1142 pixelssum3 = vec_add(pixelssum3, vctwo); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1143 pixelssum1 = vec_add((vector unsigned short)pixelsv1, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1144 (vector unsigned short)pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1145 pixelssum1 = vec_add(pixelssum1, vctwo); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1146 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1147 for (i = 0; i < h ; i++) { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1148 blockv = vec_ld(0, block); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1149 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1150 temp1 = vec_ld(line_size, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1151 temp2 = vec_ld(line_size + 16, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1152 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1153 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1154 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1155 pixelsv2 = temp2; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1156 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1157 else |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1158 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1159 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1160 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1161 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1162 pixelsv3 = vec_mergel(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1163 pixelsv4 = vec_mergel(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1164 pixelsv1 = vec_mergeh(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1165 pixelsv2 = vec_mergeh(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1166 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1167 pixelssum4 = vec_add((vector unsigned short)pixelsv3, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1168 (vector unsigned short)pixelsv4); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1169 pixelssum2 = vec_add((vector unsigned short)pixelsv1, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1170 (vector unsigned short)pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1171 temp4 = vec_add(pixelssum3, pixelssum4); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1172 temp4 = vec_sra(temp4, vctwo); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1173 temp3 = vec_add(pixelssum1, pixelssum2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1174 temp3 = vec_sra(temp3, vctwo); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1175 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1176 pixelssum3 = vec_add(pixelssum4, vctwo); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1177 pixelssum1 = vec_add(pixelssum2, vctwo); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1178 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1179 blockv = vec_packsu(temp3, temp4); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1180 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1181 vec_st(blockv, 0, block); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1182 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1183 block += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1184 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1185 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1186 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1187 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1188 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1189 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1190 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1191 /* next one assumes that ((line_size % 16) == 0) */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1192 void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1193 { |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1194 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1195 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1196 int j; |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1197 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1198 for (j = 0; j < 4; j++) { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1199 int i; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1200 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1201 const uint32_t b = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1202 (((const struct unaligned_32 *) (pixels + 1))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1203 uint32_t l0 = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1204 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1205 uint32_t h0 = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1206 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1207 uint32_t l1, h1; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1208 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1209 for (i = 0; i < h; i += 2) { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1210 uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1211 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1212 l1 = (a & 0x03030303UL) + (b & 0x03030303UL); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1213 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1214 *((uint32_t *) block) = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1215 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1216 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1217 block += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1218 a = (((const struct unaligned_32 *) (pixels))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1219 b = (((const struct unaligned_32 *) (pixels + 1))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1220 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1221 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1222 *((uint32_t *) block) = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1223 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1224 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1225 block += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1226 } pixels += 4 - line_size * (h + 1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1227 block += 4 - line_size * h; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1228 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1229 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1230 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1231 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1232 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1233 register int i; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1234 register vector unsigned char |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1235 pixelsv1, pixelsv2, pixelsv3, pixelsv4; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1236 register vector unsigned char |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1237 blockv, temp1, temp2; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1238 register vector unsigned short |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1239 pixelssum1, pixelssum2, temp3, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1240 pixelssum3, pixelssum4, temp4; |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1241 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1242 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1243 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); |
1340
09b8fe0f0139
PPC fixes & clean-up patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1277
diff
changeset
|
1244 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1245 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); |
1340
09b8fe0f0139
PPC fixes & clean-up patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1277
diff
changeset
|
1246 |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1247 temp1 = vec_ld(0, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1248 temp2 = vec_ld(16, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1249 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1250 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1251 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1252 pixelsv2 = temp2; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1253 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1254 else |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1255 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1256 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1257 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1258 pixelsv3 = vec_mergel(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1259 pixelsv4 = vec_mergel(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1260 pixelsv1 = vec_mergeh(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1261 pixelsv2 = vec_mergeh(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1262 pixelssum3 = vec_add((vector unsigned short)pixelsv3, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1263 (vector unsigned short)pixelsv4); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1264 pixelssum3 = vec_add(pixelssum3, vcone); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1265 pixelssum1 = vec_add((vector unsigned short)pixelsv1, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1266 (vector unsigned short)pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1267 pixelssum1 = vec_add(pixelssum1, vcone); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1268 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1269 for (i = 0; i < h ; i++) { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1270 blockv = vec_ld(0, block); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1271 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1272 temp1 = vec_ld(line_size, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1273 temp2 = vec_ld(line_size + 16, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1274 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1275 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1276 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1277 pixelsv2 = temp2; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1278 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1279 else |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1280 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1281 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1282 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1283 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1284 pixelsv3 = vec_mergel(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1285 pixelsv4 = vec_mergel(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1286 pixelsv1 = vec_mergeh(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1287 pixelsv2 = vec_mergeh(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1288 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1289 pixelssum4 = vec_add((vector unsigned short)pixelsv3, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1290 (vector unsigned short)pixelsv4); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1291 pixelssum2 = vec_add((vector unsigned short)pixelsv1, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1292 (vector unsigned short)pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1293 temp4 = vec_add(pixelssum3, pixelssum4); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1294 temp4 = vec_sra(temp4, vctwo); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1295 temp3 = vec_add(pixelssum1, pixelssum2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1296 temp3 = vec_sra(temp3, vctwo); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1297 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1298 pixelssum3 = vec_add(pixelssum4, vcone); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1299 pixelssum1 = vec_add(pixelssum2, vcone); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1300 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1301 blockv = vec_packsu(temp3, temp4); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1302 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1303 vec_st(blockv, 0, block); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1304 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1305 block += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1306 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1307 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1308 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1309 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1310 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1311 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1312 |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1313 int has_altivec(void) |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1314 { |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1315 #ifdef CONFIG_DARWIN |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1316 int sels[2] = {CTL_HW, HW_VECTORUNIT}; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1317 int has_vu = 0; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1318 size_t len = sizeof(has_vu); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1319 int err; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1320 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1321 err = sysctl(sels, 2, &has_vu, &len, NULL, 0); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1322 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1323 if (err == 0) return (has_vu != 0); |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1324 #else /* CONFIG_DARWIN */ |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1325 /* no Darwin, do it the brute-force way */ |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1326 /* this is borrowed from the libmpeg2 library */ |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1327 { |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1328 signal (SIGILL, sigill_handler); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1329 if (sigsetjmp (jmpbuf, 1)) { |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1330 signal (SIGILL, SIG_DFL); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1331 } else { |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1332 canjump = 1; |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1333 |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1334 asm volatile ("mtspr 256, %0\n\t" |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1335 "vand %%v0, %%v0, %%v0" |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1336 : |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1337 : "r" (-1)); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1338 |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1339 signal (SIGILL, SIG_DFL); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1340 return 1; |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1341 } |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1342 } |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1343 #endif /* CONFIG_DARWIN */ |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1344 return 0; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1345 } |