annotate ppc/dsputil_altivec.c @ 1352:e8ff4783f188 libavcodec

1) remove TBL support in PPC performance. It's much more useful to use the PMCs, and with Apple's CHUD it's fairly easy too. No reason to keep useless code around 2) make the PPC perf stuff a configure option 3) make put_pixels16_altivec a bit faster by unrolling the loop by 4 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
author michaelni
date Wed, 09 Jul 2003 20:18:13 +0000
parents 09b8fe0f0139
children dea5b2946999
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
828
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
1 /*
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
2 * Copyright (c) 2002 Brian Foley
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
3 * Copyright (c) 2002 Dieter Shirley
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
4 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
828
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
5 *
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
6 * This library is free software; you can redistribute it and/or
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
7 * modify it under the terms of the GNU Lesser General Public
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
8 * License as published by the Free Software Foundation; either
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
9 * version 2 of the License, or (at your option) any later version.
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
10 *
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
11 * This library is distributed in the hope that it will be useful,
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
14 * Lesser General Public License for more details.
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
15 *
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
16 * You should have received a copy of the GNU Lesser General Public
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
17 * License along with this library; if not, write to the Free Software
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
19 */
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
20
623
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
21 #include "../dsputil.h"
1277
f3152eb76f1a altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents: 1064
diff changeset
22
f3152eb76f1a altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents: 1064
diff changeset
23 #include "gcc_fixes.h"
f3152eb76f1a altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents: 1064
diff changeset
24
828
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
25 #include "dsputil_altivec.h"
623
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
26
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
27 #ifdef CONFIG_DARWIN
623
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
28 #include <sys/sysctl.h>
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
29 #else /* CONFIG_DARWIN */
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
30 #include <signal.h>
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
31 #include <setjmp.h>
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
32
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
33 static sigjmp_buf jmpbuf;
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
34 static volatile sig_atomic_t canjump = 0;
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
35
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
36 static void sigill_handler (int sig)
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
37 {
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
38 if (!canjump) {
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
39 signal (sig, SIG_DFL);
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
40 raise (sig);
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
41 }
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
42
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
43 canjump = 0;
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
44 siglongjmp (jmpbuf, 1);
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
45 }
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
46 #endif /* CONFIG_DARWIN */
623
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
47
878
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
48 int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
49 {
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
50 int i;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
51 int s __attribute__((aligned(16)));
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
52 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
53 vector unsigned char *tv;
878
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
54 vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
55 vector unsigned int sad;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
56 vector signed int sumdiffs;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
57
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
58 s = 0;
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
59 sad = (vector unsigned int)vec_splat_u32(0);
878
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
60 for(i=0;i<16;i++) {
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
61 /*
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
62 Read unaligned pixels into our vectors. The vectors are as follows:
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
63 pix1v: pix1[0]-pix1[15]
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
64 pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
65 */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
66 tv = (vector unsigned char *) pix1;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
67 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
68
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
69 tv = (vector unsigned char *) &pix2[0];
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
70 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
71
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
72 tv = (vector unsigned char *) &pix2[1];
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
73 pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
74
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
75 /* Calculate the average vector */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
76 avgv = vec_avg(pix2v, pix2iv);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
77
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
78 /* Calculate a sum of abs differences vector */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
79 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
80
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
81 /* Add each 4 pixel group together and put 4 results into sad */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
82 sad = vec_sum4s(t5, sad);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
83
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
84 pix1 += line_size;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
85 pix2 += line_size;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
86 }
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
87 /* Sum up the four partial sums, and put the result into s */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
88 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
89 sumdiffs = vec_splat(sumdiffs, 3);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
90 vec_ste(sumdiffs, 0, &s);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
91
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
92 return s;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
93 }
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
94
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
95 int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
96 {
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
97 int i;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
98 int s __attribute__((aligned(16)));
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
99 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
100 vector unsigned char *tv;
878
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
101 vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
102 vector unsigned int sad;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
103 vector signed int sumdiffs;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
104 uint8_t *pix3 = pix2 + line_size;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
105
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
106 s = 0;
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
107 sad = (vector unsigned int)vec_splat_u32(0);
878
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
108
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
109 /*
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
110 Due to the fact that pix3 = pix2 + line_size, the pix3 of one
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
111 iteration becomes pix2 in the next iteration. We can use this
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
112 fact to avoid a potentially expensive unaligned read, each
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
113 time around the loop.
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
114 Read unaligned pixels into our vectors. The vectors are as follows:
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
115 pix2v: pix2[0]-pix2[15]
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
116 Split the pixel vectors into shorts
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
117 */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
118 tv = (vector unsigned char *) &pix2[0];
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
119 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
120
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
121 for(i=0;i<16;i++) {
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
122 /*
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
123 Read unaligned pixels into our vectors. The vectors are as follows:
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
124 pix1v: pix1[0]-pix1[15]
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
125 pix3v: pix3[0]-pix3[15]
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
126 */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
127 tv = (vector unsigned char *) pix1;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
128 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
129
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
130 tv = (vector unsigned char *) &pix3[0];
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
131 pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
132
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
133 /* Calculate the average vector */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
134 avgv = vec_avg(pix2v, pix3v);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
135
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
136 /* Calculate a sum of abs differences vector */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
137 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
138
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
139 /* Add each 4 pixel group together and put 4 results into sad */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
140 sad = vec_sum4s(t5, sad);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
141
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
142 pix1 += line_size;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
143 pix2v = pix3v;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
144 pix3 += line_size;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
145
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
146 }
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
147
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
148 /* Sum up the four partial sums, and put the result into s */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
149 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
150 sumdiffs = vec_splat(sumdiffs, 3);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
151 vec_ste(sumdiffs, 0, &s);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
152 return s;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
153 }
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
154
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
155 int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
156 {
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
157 int i;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
158 int s __attribute__((aligned(16)));
878
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
159 uint8_t *pix3 = pix2 + line_size;
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
160 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
161 const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
162 vector unsigned char *tv, avgv, t5;
878
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
163 vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
164 vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
165 vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
166 vector unsigned short avghv, avglv;
878
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
167 vector unsigned short t1, t2, t3, t4;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
168 vector unsigned int sad;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
169 vector signed int sumdiffs;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
170
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
171 sad = (vector unsigned int)vec_splat_u32(0);
878
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
172
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
173 s = 0;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
174
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
175 /*
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
176 Due to the fact that pix3 = pix2 + line_size, the pix3 of one
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
177 iteration becomes pix2 in the next iteration. We can use this
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
178 fact to avoid a potentially expensive unaligned read, as well
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
179 as some splitting, and vector addition each time around the loop.
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
180 Read unaligned pixels into our vectors. The vectors are as follows:
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
181 pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
182 Split the pixel vectors into shorts
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
183 */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
184 tv = (vector unsigned char *) &pix2[0];
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
185 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
186
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
187 tv = (vector unsigned char *) &pix2[1];
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
188 pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
189
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
190 pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
191 pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
192 pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
193 pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
194 t1 = vec_add(pix2hv, pix2ihv);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
195 t2 = vec_add(pix2lv, pix2ilv);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
196
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
197 for(i=0;i<16;i++) {
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
198 /*
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
199 Read unaligned pixels into our vectors. The vectors are as follows:
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
200 pix1v: pix1[0]-pix1[15]
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
201 pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16]
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
202 */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
203 tv = (vector unsigned char *) pix1;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
204 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
205
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
206 tv = (vector unsigned char *) &pix3[0];
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
207 pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
208
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
209 tv = (vector unsigned char *) &pix3[1];
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
210 pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
211
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
212 /*
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
213 Note that Altivec does have vec_avg, but this works on vector pairs
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
214 and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
215 would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
216 Instead, we have to split the pixel vectors into vectors of shorts,
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
217 and do the averaging by hand.
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
218 */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
219
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
220 /* Split the pixel vectors into shorts */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
221 pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
222 pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
223 pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
224 pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
225
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
226 /* Do the averaging on them */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
227 t3 = vec_add(pix3hv, pix3ihv);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
228 t4 = vec_add(pix3lv, pix3ilv);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
229
884
2cef5c4c0ca6 * altivec and pix_norm patch by Brian Foley
kabi
parents: 878
diff changeset
230 avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
2cef5c4c0ca6 * altivec and pix_norm patch by Brian Foley
kabi
parents: 878
diff changeset
231 avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
878
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
232
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
233 /* Pack the shorts back into a result */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
234 avgv = vec_pack(avghv, avglv);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
235
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
236 /* Calculate a sum of abs differences vector */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
237 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
238
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
239 /* Add each 4 pixel group together and put 4 results into sad */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
240 sad = vec_sum4s(t5, sad);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
241
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
242 pix1 += line_size;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
243 pix3 += line_size;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
244 /* Transfer the calculated values for pix3 into pix2 */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
245 t1 = t3;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
246 t2 = t4;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
247 }
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
248 /* Sum up the four partial sums, and put the result into s */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
249 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
250 sumdiffs = vec_splat(sumdiffs, 3);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
251 vec_ste(sumdiffs, 0, &s);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
252
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
253 return s;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
254 }
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
255
623
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
256 int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
257 {
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
258 int i;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
259 int s __attribute__((aligned(16)));
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
260 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
623
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
261 vector unsigned char perm1, perm2, *pix1v, *pix2v;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
262 vector unsigned char t1, t2, t3,t4, t5;
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
263 vector unsigned int sad;
623
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
264 vector signed int sumdiffs;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
265
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
266 sad = (vector unsigned int)vec_splat_u32(0);
623
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
267
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
268
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
269 for(i=0;i<16;i++) {
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
270 /* Read potentially unaligned pixels into t1 and t2 */
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
271 perm1 = vec_lvsl(0, pix1);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
272 pix1v = (vector unsigned char *) pix1;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
273 perm2 = vec_lvsl(0, pix2);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
274 pix2v = (vector unsigned char *) pix2;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
275 t1 = vec_perm(pix1v[0], pix1v[1], perm1);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
276 t2 = vec_perm(pix2v[0], pix2v[1], perm2);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
277
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
278 /* Calculate a sum of abs differences vector */
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
279 t3 = vec_max(t1, t2);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
280 t4 = vec_min(t1, t2);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
281 t5 = vec_sub(t3, t4);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
282
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
283 /* Add each 4 pixel group together and put 4 results into sad */
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
284 sad = vec_sum4s(t5, sad);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
285
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
286 pix1 += line_size;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
287 pix2 += line_size;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
288 }
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
289
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
290 /* Sum up the four partial sums, and put the result into s */
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
291 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
292 sumdiffs = vec_splat(sumdiffs, 3);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
293 vec_ste(sumdiffs, 0, &s);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
294
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
295 return s;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
296 }
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
297
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
298 int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
299 {
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
300 int i;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
301 int s __attribute__((aligned(16)));
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
302 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
623
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
303 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
304 vector unsigned char t1, t2, t3,t4, t5;
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
305 vector unsigned int sad;
623
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
306 vector signed int sumdiffs;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
307
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
308 sad = (vector unsigned int)vec_splat_u32(0);
1277
f3152eb76f1a altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents: 1064
diff changeset
309
f3152eb76f1a altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents: 1064
diff changeset
310 permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
623
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
311
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
312 for(i=0;i<8;i++) {
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
313 /* Read potentially unaligned pixels into t1 and t2
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
314 Since we're reading 16 pixels, and actually only want 8,
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
315 mask out the last 8 pixels. The 0s don't change the sum. */
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
316 perm1 = vec_lvsl(0, pix1);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
317 pix1v = (vector unsigned char *) pix1;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
318 perm2 = vec_lvsl(0, pix2);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
319 pix2v = (vector unsigned char *) pix2;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
320 t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
321 t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
322
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
323 /* Calculate a sum of abs differences vector */
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
324 t3 = vec_max(t1, t2);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
325 t4 = vec_min(t1, t2);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
326 t5 = vec_sub(t3, t4);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
327
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
328 /* Add each 4 pixel group together and put 4 results into sad */
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
329 sad = vec_sum4s(t5, sad);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
330
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
331 pix1 += line_size;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
332 pix2 += line_size;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
333 }
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
334
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
335 /* Sum up the four partial sums, and put the result into s */
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
336 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
337 sumdiffs = vec_splat(sumdiffs, 3);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
338 vec_ste(sumdiffs, 0, &s);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
339
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
340 return s;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
341 }
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
342
878
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
343 int pix_norm1_altivec(uint8_t *pix, int line_size)
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
344 {
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
345 int i;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
346 int s __attribute__((aligned(16)));
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
347 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
348 vector unsigned char *tv;
878
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
349 vector unsigned char pixv;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
350 vector unsigned int sv;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
351 vector signed int sum;
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
352
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
353 sv = (vector unsigned int)vec_splat_u32(0);
878
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
354
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
355 s = 0;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
356 for (i = 0; i < 16; i++) {
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
357 /* Read in the potentially unaligned pixels */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
358 tv = (vector unsigned char *) pix;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
359 pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
360
884
2cef5c4c0ca6 * altivec and pix_norm patch by Brian Foley
kabi
parents: 878
diff changeset
361 /* Square the values, and add them to our sum */
2cef5c4c0ca6 * altivec and pix_norm patch by Brian Foley
kabi
parents: 878
diff changeset
362 sv = vec_msum(pixv, pixv, sv);
878
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
363
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
364 pix += line_size;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
365 }
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
366 /* Sum up the four partial sums, and put the result into s */
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
367 sum = vec_sums((vector signed int) sv, (vector signed int) zero);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
368 sum = vec_splat(sum, 3);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
369 vec_ste(sum, 0, &s);
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
370
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
371 return s;
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
372 }
6ea69518e5f7 altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents: 828
diff changeset
373
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
374 /**
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
375 * Sum of Squared Errors for a 8x8 block.
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
376 * AltiVec-enhanced.
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
377 * It's the pix_abs8x8_altivec code above w/ squaring added.
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
378 */
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
379 int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
380 {
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
381 int i;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
382 int s __attribute__((aligned(16)));
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
383 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
384 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
385 vector unsigned char t1, t2, t3,t4, t5;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
386 vector unsigned int sum;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
387 vector signed int sumsqr;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
388
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
389 sum = (vector unsigned int)vec_splat_u32(0);
1277
f3152eb76f1a altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents: 1064
diff changeset
390
f3152eb76f1a altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents: 1064
diff changeset
391 permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
f3152eb76f1a altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents: 1064
diff changeset
392
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
393
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
394 for(i=0;i<8;i++) {
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
395 /* Read potentially unaligned pixels into t1 and t2
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
396 Since we're reading 16 pixels, and actually only want 8,
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
397 mask out the last 8 pixels. The 0s don't change the sum. */
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
398 perm1 = vec_lvsl(0, pix1);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
399 pix1v = (vector unsigned char *) pix1;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
400 perm2 = vec_lvsl(0, pix2);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
401 pix2v = (vector unsigned char *) pix2;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
402 t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
403 t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
404
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
405 /*
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
406 Since we want to use unsigned chars, we can take advantage
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
407 of the fact that abs(a-b)^2 = (a-b)^2.
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
408 */
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
409
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
410 /* Calculate abs differences vector */
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
411 t3 = vec_max(t1, t2);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
412 t4 = vec_min(t1, t2);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
413 t5 = vec_sub(t3, t4);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
414
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
415 /* Square the values and add them to our sum */
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
416 sum = vec_msum(t5, t5, sum);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
417
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
418 pix1 += line_size;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
419 pix2 += line_size;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
420 }
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
421
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
422 /* Sum up the four partial sums, and put the result into s */
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
423 sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
424 sumsqr = vec_splat(sumsqr, 3);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
425 vec_ste(sumsqr, 0, &s);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
426
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
427 return s;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
428 }
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
429
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
430 /**
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
431 * Sum of Squared Errors for a 16x16 block.
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
432 * AltiVec-enhanced.
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
433 * It's the pix_abs16x16_altivec code above w/ squaring added.
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
434 */
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
435 int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
436 {
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
437 int i;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
438 int s __attribute__((aligned(16)));
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
439 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
440 vector unsigned char perm1, perm2, *pix1v, *pix2v;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
441 vector unsigned char t1, t2, t3,t4, t5;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
442 vector unsigned int sum;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
443 vector signed int sumsqr;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
444
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
445 sum = (vector unsigned int)vec_splat_u32(0);
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
446
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
447 for(i=0;i<16;i++) {
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
448 /* Read potentially unaligned pixels into t1 and t2 */
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
449 perm1 = vec_lvsl(0, pix1);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
450 pix1v = (vector unsigned char *) pix1;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
451 perm2 = vec_lvsl(0, pix2);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
452 pix2v = (vector unsigned char *) pix2;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
453 t1 = vec_perm(pix1v[0], pix1v[1], perm1);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
454 t2 = vec_perm(pix2v[0], pix2v[1], perm2);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
455
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
456 /*
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
457 Since we want to use unsigned chars, we can take advantage
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
458 of the fact that abs(a-b)^2 = (a-b)^2.
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
459 */
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
460
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
461 /* Calculate abs differences vector */
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
462 t3 = vec_max(t1, t2);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
463 t4 = vec_min(t1, t2);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
464 t5 = vec_sub(t3, t4);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
465
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
466 /* Square the values and add them to our sum */
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
467 sum = vec_msum(t5, t5, sum);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
468
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
469 pix1 += line_size;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
470 pix2 += line_size;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
471 }
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
472
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
473 /* Sum up the four partial sums, and put the result into s */
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
474 sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
475 sumsqr = vec_splat(sumsqr, 3);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
476 vec_ste(sumsqr, 0, &s);
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
477
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
478 return s;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
479 }
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
480
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1033
diff changeset
481 int pix_sum_altivec(uint8_t * pix, int line_size)
623
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
482 {
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
483 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
623
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
484 vector unsigned char perm, *pixv;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
485 vector unsigned char t1;
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
486 vector unsigned int sad;
623
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
487 vector signed int sumdiffs;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
488
981
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
489 int i;
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
490 int s __attribute__((aligned(16)));
8bec850dc9c7 altivec patches by Romain Dolbeau
bellard
parents: 978
diff changeset
491
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
492 sad = (vector unsigned int)vec_splat_u32(0);
623
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
493
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
494 for (i = 0; i < 16; i++) {
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
495 /* Read the potentially unaligned 16 pixels into t1 */
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
496 perm = vec_lvsl(0, pix);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
497 pixv = (vector unsigned char *) pix;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
498 t1 = vec_perm(pixv[0], pixv[1], perm);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
499
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
500 /* Add each 4 pixel group together and put 4 results into sad */
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
501 sad = vec_sum4s(t1, sad);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
502
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
503 pix += line_size;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
504 }
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
505
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
506 /* Sum up the four partial sums, and put the result into s */
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
507 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
508 sumdiffs = vec_splat(sumdiffs, 3);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
509 vec_ste(sumdiffs, 0, &s);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
510
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
511 return s;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
512 }
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
513
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1033
diff changeset
514 void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
828
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
515 {
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
516 int i;
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
517 vector unsigned char perm, bytes, *pixv;
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
518 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
828
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
519 vector signed short shorts;
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
520
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
521 for(i=0;i<8;i++)
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
522 {
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
523 // Read potentially unaligned pixels.
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
524 // We're reading 16 pixels, and actually only want 8,
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
525 // but we simply ignore the extras.
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
526 perm = vec_lvsl(0, pixels);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
527 pixv = (vector unsigned char *) pixels;
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
528 bytes = vec_perm(pixv[0], pixv[1], perm);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
529
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
530 // convert the bytes into shorts
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
531 shorts = (vector signed short)vec_mergeh(zero, bytes);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
532
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
533 // save the data to the block, we assume the block is 16-byte aligned
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
534 vec_st(shorts, i*16, (vector signed short*)block);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
535
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
536 pixels += line_size;
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
537 }
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
538 }
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
539
1064
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1033
diff changeset
540 void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
b32afefe7d33 * UINTX -> uintx_t INTX -> intx_t
kabi
parents: 1033
diff changeset
541 const uint8_t *s2, int stride)
828
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
542 {
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
543 int i;
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
544 vector unsigned char perm, bytes, *pixv;
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
545 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
828
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
546 vector signed short shorts1, shorts2;
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
547
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
548 for(i=0;i<4;i++)
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
549 {
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
550 // Read potentially unaligned pixels
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
551 // We're reading 16 pixels, and actually only want 8,
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
552 // but we simply ignore the extras.
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
553 perm = vec_lvsl(0, s1);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
554 pixv = (vector unsigned char *) s1;
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
555 bytes = vec_perm(pixv[0], pixv[1], perm);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
556
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
557 // convert the bytes into shorts
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
558 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
559
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
560 // Do the same for the second block of pixels
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
561 perm = vec_lvsl(0, s2);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
562 pixv = (vector unsigned char *) s2;
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
563 bytes = vec_perm(pixv[0], pixv[1], perm);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
564
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
565 // convert the bytes into shorts
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
566 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
567
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
568 // Do the subtraction
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
569 shorts1 = vec_sub(shorts1, shorts2);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
570
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
571 // save the data to the block, we assume the block is 16-byte aligned
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
572 vec_st(shorts1, 0, (vector signed short*)block);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
573
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
574 s1 += stride;
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
575 s2 += stride;
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
576 block += 8;
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
577
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
578
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
579 // The code below is a copy of the code above... This is a manual
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
580 // unroll.
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
581
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
582 // Read potentially unaligned pixels
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
583 // We're reading 16 pixels, and actually only want 8,
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
584 // but we simply ignore the extras.
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
585 perm = vec_lvsl(0, s1);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
586 pixv = (vector unsigned char *) s1;
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
587 bytes = vec_perm(pixv[0], pixv[1], perm);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
588
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
589 // convert the bytes into shorts
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
590 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
591
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
592 // Do the same for the second block of pixels
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
593 perm = vec_lvsl(0, s2);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
594 pixv = (vector unsigned char *) s2;
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
595 bytes = vec_perm(pixv[0], pixv[1], perm);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
596
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
597 // convert the bytes into shorts
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
598 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
599
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
600 // Do the subtraction
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
601 shorts1 = vec_sub(shorts1, shorts2);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
602
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
603 // save the data to the block, we assume the block is 16-byte aligned
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
604 vec_st(shorts1, 0, (vector signed short*)block);
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
605
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
606 s1 += stride;
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
607 s2 += stride;
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
608 block += 8;
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
609 }
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
610 }
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
611
995
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
612 int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
613 return pix_abs16x16_altivec(a,b,stride);
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
614 }
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
615
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
616 int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
617 return pix_abs8x8_altivec(a,b,stride);
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
618 }
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
619
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
620 void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
621 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
995
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
622 int i;
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
623 for(i=0; i+7<w; i++){
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
624 dst[i+0] += src[i+0];
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
625 dst[i+1] += src[i+1];
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
626 dst[i+2] += src[i+2];
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
627 dst[i+3] += src[i+3];
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
628 dst[i+4] += src[i+4];
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
629 dst[i+5] += src[i+5];
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
630 dst[i+6] += src[i+6];
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
631 dst[i+7] += src[i+7];
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
632 }
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
633 for(; i<w; i++)
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
634 dst[i+0] += src[i+0];
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
635 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
995
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
636 register int i;
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
637 register vector unsigned char vdst, vsrc;
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
638
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
639 /* dst and src are 16 bytes-aligned (guaranteed) */
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
640 for(i = 0 ; (i + 15) < w ; i++)
995
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
641 {
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
642 vdst = vec_ld(i << 4, (unsigned char*)dst);
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
643 vsrc = vec_ld(i << 4, (unsigned char*)src);
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
644 vdst = vec_add(vsrc, vdst);
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
645 vec_st(vdst, i << 4, (unsigned char*)dst);
995
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
646 }
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
647 /* if w is not a multiple of 16 */
995
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
648 for (; (i < w) ; i++)
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
649 {
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
650 dst[i] = src[i];
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
651 }
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
652 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
653 }
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
654
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
655 /* next one assumes that ((line_size % 16) == 0) */
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
656 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
657 {
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
658 POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
659 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
660 int i;
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
661
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
662 POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
663
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
664 for(i=0; i<h; i++) {
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
665 *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l);
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
666 *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l);
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
667 *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l);
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
668 *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l);
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
669 pixels+=line_size;
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
670 block +=line_size;
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
671 }
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
672
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
673 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
674
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
675 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
676 register vector unsigned char pixelsv1, pixelsv2;
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
677 register vector unsigned char pixelsv1B, pixelsv2B;
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
678 register vector unsigned char pixelsv1C, pixelsv2C;
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
679 register vector unsigned char pixelsv1D, pixelsv2D;
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
680
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
681 register vector unsigned char perm = vec_lvsl(0, pixels);
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
682 int i;
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
683 register int line_size_2 = line_size << 1;
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
684 register int line_size_3 = line_size + line_size_2;
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
685 register int line_size_4 = line_size << 2;
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
686
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
687 POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
688 // hand-unrolling the loop by 4 gains about 15%
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
689 // mininum execution time goes from 74 to 60 cycles
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
690 // it's faster than -funroll-loops, but using
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
691 // -funroll-loops w/ this is bad - 74 cycles again.
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
692 // all this is on a 7450, tuning for the 7450
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
693 #if 0
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
694 for(i=0; i<h; i++) {
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
695 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
696 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
697 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
1015
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
698 0, (unsigned char*)block);
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
699 pixels+=line_size;
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
700 block +=line_size;
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
701 }
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
702 #else
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
703 for(i=0; i<h; i+=4) {
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
704 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
705 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
706 pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
707 pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels);
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
708 pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
709 pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels);
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
710 pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
711 pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels);
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
712 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
713 0, (unsigned char*)block);
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
714 vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
715 line_size, (unsigned char*)block);
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
716 vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
717 line_size_2, (unsigned char*)block);
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
718 vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
719 line_size_3, (unsigned char*)block);
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
720 pixels+=line_size_4;
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
721 block +=line_size_4;
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
722 }
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
723 #endif
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
724 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
725
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
726 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
727 }
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
728
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
729 /* next one assumes that ((line_size % 16) == 0) */
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
730 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
731 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
732 {
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
733 POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
734 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
735 int i;
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
736
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
737 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
738
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
739 for(i=0; i<h; i++) {
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
740 op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l));
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
741 op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l));
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
742 op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l));
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
743 op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l));
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
744 pixels+=line_size;
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
745 block +=line_size;
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
746 }
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
747
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
748 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
749
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
750 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
751 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
752 register vector unsigned char perm = vec_lvsl(0, pixels);
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
753 int i;
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
754
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
755 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
756
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
757 for(i=0; i<h; i++) {
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
758 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
759 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
760 blockv = vec_ld(0, block);
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
761 pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
762 blockv = vec_avg(blockv,pixelsv);
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
763 vec_st(blockv, 0, (unsigned char*)block);
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
764 pixels+=line_size;
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
765 block +=line_size;
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
766 }
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
767
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
768 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
1015
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
769
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
770 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
771 }
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
772
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
773 /* next one assumes that ((line_size % 8) == 0) */
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
774 void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
1015
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
775 {
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
776 POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
1015
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
777 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
778 int i;
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
779 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
1015
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
780 for (i = 0; i < h; i++) {
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
781 *((uint32_t *) (block)) =
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
782 (((*((uint32_t *) (block))) |
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
783 ((((const struct unaligned_32 *) (pixels))->l))) -
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
784 ((((*((uint32_t *) (block))) ^
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
785 ((((const struct unaligned_32 *) (pixels))->
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
786 l))) & 0xFEFEFEFEUL) >> 1));
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
787 *((uint32_t *) (block + 4)) =
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
788 (((*((uint32_t *) (block + 4))) |
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
789 ((((const struct unaligned_32 *) (pixels + 4))->l))) -
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
790 ((((*((uint32_t *) (block + 4))) ^
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
791 ((((const struct unaligned_32 *) (pixels +
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
792 4))->
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
793 l))) & 0xFEFEFEFEUL) >> 1));
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
794 pixels += line_size;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
795 block += line_size;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
796 }
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
797 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
1015
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
798
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
799 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
800 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
801 int i;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
802
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
803 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
1015
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
804
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
805 for (i = 0; i < h; i++) {
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
806 /*
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
807 block is 8 bytes-aligned, so we're either in the
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
808 left block (16 bytes-aligned) or in the right block (not)
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
809 */
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
810 int rightside = ((unsigned long)block & 0x0000000F);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
811
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
812 blockv = vec_ld(0, block);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
813 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
814 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
815 pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
816
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
817 if (rightside)
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
818 {
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
819 pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
820 }
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
821 else
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
822 {
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
823 pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
824 }
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
825
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
826 blockv = vec_avg(blockv, pixelsv);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
827
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
828 vec_st(blockv, 0, block);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
829
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
830 pixels += line_size;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
831 block += line_size;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
832 }
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
833
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
834 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
1015
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
835
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
836 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
837 }
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
838
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
839 /* next one assumes that ((line_size % 8) == 0) */
1015
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
840 void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
841 {
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
842 POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
1015
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
843 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
844 int j;
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
845 POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
1015
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
846 for (j = 0; j < 2; j++) {
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
847 int i;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
848 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
849 const uint32_t b =
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
850 (((const struct unaligned_32 *) (pixels + 1))->l);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
851 uint32_t l0 =
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
852 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
853 uint32_t h0 =
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
854 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
855 uint32_t l1, h1;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
856 pixels += line_size;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
857 for (i = 0; i < h; i += 2) {
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
858 uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
859 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
860 l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
861 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
862 *((uint32_t *) block) =
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
863 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
864 pixels += line_size;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
865 block += line_size;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
866 a = (((const struct unaligned_32 *) (pixels))->l);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
867 b = (((const struct unaligned_32 *) (pixels + 1))->l);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
868 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
869 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
870 *((uint32_t *) block) =
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
871 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
872 pixels += line_size;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
873 block += line_size;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
874 } pixels += 4 - line_size * (h + 1);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
875 block += 4 - line_size * h;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
876 }
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
877
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
878 POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
1015
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
879
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
880 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
881 register int i;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
882 register vector unsigned char
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
883 pixelsv1, pixelsv2,
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
884 pixelsavg;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
885 register vector unsigned char
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
886 blockv, temp1, temp2;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
887 register vector unsigned short
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
888 pixelssum1, pixelssum2, temp3;
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
889 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
890 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
1015
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
891
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
892 temp1 = vec_ld(0, pixels);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
893 temp2 = vec_ld(16, pixels);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
894 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
895 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
896 {
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
897 pixelsv2 = temp2;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
898 }
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
899 else
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
900 {
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
901 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
902 }
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
903 pixelsv1 = vec_mergeh(vczero, pixelsv1);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
904 pixelsv2 = vec_mergeh(vczero, pixelsv2);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
905 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
906 (vector unsigned short)pixelsv2);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
907 pixelssum1 = vec_add(pixelssum1, vctwo);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
908
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
909 POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
1015
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
910 for (i = 0; i < h ; i++) {
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
911 int rightside = ((unsigned long)block & 0x0000000F);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
912 blockv = vec_ld(0, block);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
913
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
914 temp1 = vec_ld(line_size, pixels);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
915 temp2 = vec_ld(line_size + 16, pixels);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
916 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
917 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
918 {
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
919 pixelsv2 = temp2;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
920 }
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
921 else
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
922 {
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
923 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
924 }
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
925
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
926 pixelsv1 = vec_mergeh(vczero, pixelsv1);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
927 pixelsv2 = vec_mergeh(vczero, pixelsv2);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
928 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
929 (vector unsigned short)pixelsv2);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
930 temp3 = vec_add(pixelssum1, pixelssum2);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
931 temp3 = vec_sra(temp3, vctwo);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
932 pixelssum1 = vec_add(pixelssum2, vctwo);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
933 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
934
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
935 if (rightside)
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
936 {
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
937 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
938 }
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
939 else
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
940 {
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
941 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
942 }
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
943
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
944 vec_st(blockv, 0, block);
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
945
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
946 block += line_size;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
947 pixels += line_size;
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
948 }
35cf2f4a0f8c PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1009
diff changeset
949
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
950 POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
1009
3b7cc8e4b83f AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 995
diff changeset
951 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
995
edc10966b081 altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents: 981
diff changeset
952 }
828
ace3ccd18dd2 Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents: 638
diff changeset
953
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
954 /* next one assumes that ((line_size % 8) == 0) */
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
955 void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
956 {
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
957 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
958 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
959 int j;
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
960 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
961 for (j = 0; j < 2; j++) {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
962 int i;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
963 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
964 const uint32_t b =
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
965 (((const struct unaligned_32 *) (pixels + 1))->l);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
966 uint32_t l0 =
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
967 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
968 uint32_t h0 =
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
969 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
970 uint32_t l1, h1;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
971 pixels += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
972 for (i = 0; i < h; i += 2) {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
973 uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
974 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
975 l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
976 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
977 *((uint32_t *) block) =
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
978 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
979 pixels += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
980 block += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
981 a = (((const struct unaligned_32 *) (pixels))->l);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
982 b = (((const struct unaligned_32 *) (pixels + 1))->l);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
983 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
984 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
985 *((uint32_t *) block) =
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
986 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
987 pixels += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
988 block += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
989 } pixels += 4 - line_size * (h + 1);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
990 block += 4 - line_size * h;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
991 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
992
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
993 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
994
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
995 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
996 register int i;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
997 register vector unsigned char
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
998 pixelsv1, pixelsv2,
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
999 pixelsavg;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1000 register vector unsigned char
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1001 blockv, temp1, temp2;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1002 register vector unsigned short
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1003 pixelssum1, pixelssum2, temp3;
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1004 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1005 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1006 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1007
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1008 temp1 = vec_ld(0, pixels);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1009 temp2 = vec_ld(16, pixels);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1010 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1011 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1012 {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1013 pixelsv2 = temp2;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1014 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1015 else
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1016 {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1017 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1018 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1019 pixelsv1 = vec_mergeh(vczero, pixelsv1);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1020 pixelsv2 = vec_mergeh(vczero, pixelsv2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1021 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1022 (vector unsigned short)pixelsv2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1023 pixelssum1 = vec_add(pixelssum1, vcone);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1024
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
1025 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1026 for (i = 0; i < h ; i++) {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1027 int rightside = ((unsigned long)block & 0x0000000F);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1028 blockv = vec_ld(0, block);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1029
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1030 temp1 = vec_ld(line_size, pixels);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1031 temp2 = vec_ld(line_size + 16, pixels);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1032 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1033 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1034 {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1035 pixelsv2 = temp2;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1036 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1037 else
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1038 {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1039 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1040 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1041
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1042 pixelsv1 = vec_mergeh(vczero, pixelsv1);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1043 pixelsv2 = vec_mergeh(vczero, pixelsv2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1044 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1045 (vector unsigned short)pixelsv2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1046 temp3 = vec_add(pixelssum1, pixelssum2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1047 temp3 = vec_sra(temp3, vctwo);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1048 pixelssum1 = vec_add(pixelssum2, vcone);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1049 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1050
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1051 if (rightside)
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1052 {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1053 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1054 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1055 else
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1056 {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1057 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1058 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1059
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1060 vec_st(blockv, 0, block);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1061
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1062 block += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1063 pixels += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1064 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1065
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
1066 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1067 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1068 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1069
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1070 /* next one assumes that ((line_size % 16) == 0) */
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1071 void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1072 {
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
1073 POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1074 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1075 int j;
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
1076 POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1077 for (j = 0; j < 4; j++) {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1078 int i;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1079 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1080 const uint32_t b =
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1081 (((const struct unaligned_32 *) (pixels + 1))->l);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1082 uint32_t l0 =
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1083 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1084 uint32_t h0 =
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1085 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1086 uint32_t l1, h1;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1087 pixels += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1088 for (i = 0; i < h; i += 2) {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1089 uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1090 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1091 l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1092 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1093 *((uint32_t *) block) =
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1094 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1095 pixels += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1096 block += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1097 a = (((const struct unaligned_32 *) (pixels))->l);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1098 b = (((const struct unaligned_32 *) (pixels + 1))->l);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1099 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1100 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1101 *((uint32_t *) block) =
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1102 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1103 pixels += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1104 block += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1105 } pixels += 4 - line_size * (h + 1);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1106 block += 4 - line_size * h;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1107 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1108
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
1109 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1110
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1111 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1112 register int i;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1113 register vector unsigned char
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1114 pixelsv1, pixelsv2, pixelsv3, pixelsv4;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1115 register vector unsigned char
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1116 blockv, temp1, temp2;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1117 register vector unsigned short
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1118 pixelssum1, pixelssum2, temp3,
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1119 pixelssum3, pixelssum4, temp4;
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1120 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1121 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
1340
09b8fe0f0139 PPC fixes & clean-up patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1277
diff changeset
1122
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
1123 POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
1340
09b8fe0f0139 PPC fixes & clean-up patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1277
diff changeset
1124
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1125 temp1 = vec_ld(0, pixels);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1126 temp2 = vec_ld(16, pixels);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1127 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1128 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1129 {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1130 pixelsv2 = temp2;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1131 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1132 else
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1133 {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1134 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1135 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1136 pixelsv3 = vec_mergel(vczero, pixelsv1);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1137 pixelsv4 = vec_mergel(vczero, pixelsv2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1138 pixelsv1 = vec_mergeh(vczero, pixelsv1);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1139 pixelsv2 = vec_mergeh(vczero, pixelsv2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1140 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1141 (vector unsigned short)pixelsv4);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1142 pixelssum3 = vec_add(pixelssum3, vctwo);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1143 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1144 (vector unsigned short)pixelsv2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1145 pixelssum1 = vec_add(pixelssum1, vctwo);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1146
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1147 for (i = 0; i < h ; i++) {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1148 blockv = vec_ld(0, block);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1149
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1150 temp1 = vec_ld(line_size, pixels);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1151 temp2 = vec_ld(line_size + 16, pixels);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1152 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1153 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1154 {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1155 pixelsv2 = temp2;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1156 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1157 else
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1158 {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1159 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1160 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1161
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1162 pixelsv3 = vec_mergel(vczero, pixelsv1);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1163 pixelsv4 = vec_mergel(vczero, pixelsv2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1164 pixelsv1 = vec_mergeh(vczero, pixelsv1);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1165 pixelsv2 = vec_mergeh(vczero, pixelsv2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1166
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1167 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1168 (vector unsigned short)pixelsv4);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1169 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1170 (vector unsigned short)pixelsv2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1171 temp4 = vec_add(pixelssum3, pixelssum4);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1172 temp4 = vec_sra(temp4, vctwo);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1173 temp3 = vec_add(pixelssum1, pixelssum2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1174 temp3 = vec_sra(temp3, vctwo);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1175
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1176 pixelssum3 = vec_add(pixelssum4, vctwo);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1177 pixelssum1 = vec_add(pixelssum2, vctwo);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1178
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1179 blockv = vec_packsu(temp3, temp4);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1180
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1181 vec_st(blockv, 0, block);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1182
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1183 block += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1184 pixels += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1185 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1186
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
1187 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1188 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1189 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1190
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1191 /* next one assumes that ((line_size % 16) == 0) */
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1192 void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1193 {
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
1194 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1195 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1196 int j;
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
1197 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1198 for (j = 0; j < 4; j++) {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1199 int i;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1200 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1201 const uint32_t b =
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1202 (((const struct unaligned_32 *) (pixels + 1))->l);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1203 uint32_t l0 =
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1204 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1205 uint32_t h0 =
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1206 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1207 uint32_t l1, h1;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1208 pixels += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1209 for (i = 0; i < h; i += 2) {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1210 uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1211 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1212 l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1213 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1214 *((uint32_t *) block) =
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1215 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1216 pixels += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1217 block += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1218 a = (((const struct unaligned_32 *) (pixels))->l);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1219 b = (((const struct unaligned_32 *) (pixels + 1))->l);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1220 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1221 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1222 *((uint32_t *) block) =
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1223 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1224 pixels += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1225 block += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1226 } pixels += 4 - line_size * (h + 1);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1227 block += 4 - line_size * h;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1228 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1229
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
1230 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1231
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1232 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1233 register int i;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1234 register vector unsigned char
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1235 pixelsv1, pixelsv2, pixelsv3, pixelsv4;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1236 register vector unsigned char
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1237 blockv, temp1, temp2;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1238 register vector unsigned short
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1239 pixelssum1, pixelssum2, temp3,
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1240 pixelssum3, pixelssum4, temp4;
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1241 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1242 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1243 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
1340
09b8fe0f0139 PPC fixes & clean-up patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1277
diff changeset
1244
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
1245 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1340
09b8fe0f0139 PPC fixes & clean-up patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1277
diff changeset
1246
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1247 temp1 = vec_ld(0, pixels);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1248 temp2 = vec_ld(16, pixels);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1249 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1250 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1251 {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1252 pixelsv2 = temp2;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1253 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1254 else
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1255 {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1256 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1257 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1258 pixelsv3 = vec_mergel(vczero, pixelsv1);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1259 pixelsv4 = vec_mergel(vczero, pixelsv2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1260 pixelsv1 = vec_mergeh(vczero, pixelsv1);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1261 pixelsv2 = vec_mergeh(vczero, pixelsv2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1262 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1263 (vector unsigned short)pixelsv4);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1264 pixelssum3 = vec_add(pixelssum3, vcone);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1265 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1266 (vector unsigned short)pixelsv2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1267 pixelssum1 = vec_add(pixelssum1, vcone);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1268
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1269 for (i = 0; i < h ; i++) {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1270 blockv = vec_ld(0, block);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1271
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1272 temp1 = vec_ld(line_size, pixels);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1273 temp2 = vec_ld(line_size + 16, pixels);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1274 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1275 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1276 {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1277 pixelsv2 = temp2;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1278 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1279 else
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1280 {
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1281 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1282 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1283
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1284 pixelsv3 = vec_mergel(vczero, pixelsv1);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1285 pixelsv4 = vec_mergel(vczero, pixelsv2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1286 pixelsv1 = vec_mergeh(vczero, pixelsv1);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1287 pixelsv2 = vec_mergeh(vczero, pixelsv2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1288
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1289 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1290 (vector unsigned short)pixelsv4);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1291 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1292 (vector unsigned short)pixelsv2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1293 temp4 = vec_add(pixelssum3, pixelssum4);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1294 temp4 = vec_sra(temp4, vctwo);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1295 temp3 = vec_add(pixelssum1, pixelssum2);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1296 temp3 = vec_sra(temp3, vctwo);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1297
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1298 pixelssum3 = vec_add(pixelssum4, vcone);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1299 pixelssum1 = vec_add(pixelssum2, vcone);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1300
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1301 blockv = vec_packsu(temp3, temp4);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1302
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1303 vec_st(blockv, 0, block);
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1304
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1305 block += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1306 pixels += line_size;
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1307 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1308
1352
e8ff4783f188 1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents: 1340
diff changeset
1309 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1024
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1310 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1311 }
9cc1031e1864 More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents: 1015
diff changeset
1312
623
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
1313 int has_altivec(void)
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
1314 {
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1315 #ifdef CONFIG_DARWIN
623
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
1316 int sels[2] = {CTL_HW, HW_VECTORUNIT};
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
1317 int has_vu = 0;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
1318 size_t len = sizeof(has_vu);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
1319 int err;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
1320
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
1321 err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
1322
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
1323 if (err == 0) return (has_vu != 0);
1033
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1324 #else /* CONFIG_DARWIN */
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1325 /* no Darwin, do it the brute-force way */
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1326 /* this is borrowed from the libmpeg2 library */
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1327 {
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1328 signal (SIGILL, sigill_handler);
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1329 if (sigsetjmp (jmpbuf, 1)) {
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1330 signal (SIGILL, SIG_DFL);
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1331 } else {
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1332 canjump = 1;
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1333
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1334 asm volatile ("mtspr 256, %0\n\t"
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1335 "vand %%v0, %%v0, %%v0"
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1336 :
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1337 : "r" (-1));
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1338
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1339 signal (SIGILL, SIG_DFL);
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1340 return 1;
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1341 }
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1342 }
b4172ff70d27 Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents: 1024
diff changeset
1343 #endif /* CONFIG_DARWIN */
623
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
1344 return 0;
92e99e506920 first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff changeset
1345 }