Mercurial > libavcodec.hg
annotate ppc/dsputil_altivec.c @ 1708:dea5b2946999 libavcodec
interlaced motion estimation
interlaced mpeg2 encoding
P & B frames
rate distored interlaced mb decission
alternate scantable support
4mv encoding fixes (thats also why the regression tests change)
passing height to most dsp functions
interlaced mpeg4 encoding (no direct mode MBs yet)
various related cleanups
disabled old motion estimaton algorithms (log, full, ...) they will either be fixed or removed
author | michael |
---|---|
date | Tue, 30 Dec 2003 16:07:57 +0000 |
parents | e8ff4783f188 |
children | b370288f004d |
rev | line source |
---|---|
828
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
1 /* |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
2 * Copyright (c) 2002 Brian Foley |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
3 * Copyright (c) 2002 Dieter Shirley |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
4 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> |
828
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
5 * |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
6 * This library is free software; you can redistribute it and/or |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
7 * modify it under the terms of the GNU Lesser General Public |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
8 * License as published by the Free Software Foundation; either |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
9 * version 2 of the License, or (at your option) any later version. |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
10 * |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
11 * This library is distributed in the hope that it will be useful, |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
14 * Lesser General Public License for more details. |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
15 * |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
16 * You should have received a copy of the GNU Lesser General Public |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
17 * License along with this library; if not, write to the Free Software |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
19 */ |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
20 |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
21 #include "../dsputil.h" |
1277
f3152eb76f1a
altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents:
1064
diff
changeset
|
22 |
f3152eb76f1a
altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents:
1064
diff
changeset
|
23 #include "gcc_fixes.h" |
f3152eb76f1a
altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents:
1064
diff
changeset
|
24 |
828
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
25 #include "dsputil_altivec.h" |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
26 |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
27 #ifdef CONFIG_DARWIN |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
28 #include <sys/sysctl.h> |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
29 #else /* CONFIG_DARWIN */ |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
30 #include <signal.h> |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
31 #include <setjmp.h> |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
32 |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
33 static sigjmp_buf jmpbuf; |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
34 static volatile sig_atomic_t canjump = 0; |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
35 |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
36 static void sigill_handler (int sig) |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
37 { |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
38 if (!canjump) { |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
39 signal (sig, SIG_DFL); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
40 raise (sig); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
41 } |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
42 |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
43 canjump = 0; |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
44 siglongjmp (jmpbuf, 1); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
45 } |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
46 #endif /* CONFIG_DARWIN */ |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
47 |
1708 | 48 int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
49 { |
981 | 50 int i; |
51 int s __attribute__((aligned(16))); | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
52 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); |
981 | 53 vector unsigned char *tv; |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
54 vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
55 vector unsigned int sad; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
56 vector signed int sumdiffs; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
57 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
58 s = 0; |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
59 sad = (vector unsigned int)vec_splat_u32(0); |
1708 | 60 for(i=0;i<h;i++) { |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
61 /* |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
62 Read unaligned pixels into our vectors. The vectors are as follows: |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
63 pix1v: pix1[0]-pix1[15] |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
64 pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
65 */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
66 tv = (vector unsigned char *) pix1; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
67 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
68 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
69 tv = (vector unsigned char *) &pix2[0]; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
70 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
71 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
72 tv = (vector unsigned char *) &pix2[1]; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
73 pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
74 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
75 /* Calculate the average vector */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
76 avgv = vec_avg(pix2v, pix2iv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
77 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
78 /* Calculate a sum of abs differences vector */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
79 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
80 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
81 /* Add each 4 pixel group together and put 4 results into sad */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
82 sad = vec_sum4s(t5, sad); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
83 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
84 pix1 += line_size; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
85 pix2 += line_size; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
86 } |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
87 /* Sum up the four partial sums, and put the result into s */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
88 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
89 sumdiffs = vec_splat(sumdiffs, 3); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
90 vec_ste(sumdiffs, 0, &s); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
91 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
92 return s; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
93 } |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
94 |
1708 | 95 int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
96 { |
981 | 97 int i; |
98 int s __attribute__((aligned(16))); | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
99 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); |
981 | 100 vector unsigned char *tv; |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
101 vector unsigned char pix1v, pix2v, pix3v, avgv, t5; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
102 vector unsigned int sad; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
103 vector signed int sumdiffs; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
104 uint8_t *pix3 = pix2 + line_size; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
105 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
106 s = 0; |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
107 sad = (vector unsigned int)vec_splat_u32(0); |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
108 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
109 /* |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
110 Due to the fact that pix3 = pix2 + line_size, the pix3 of one |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
111 iteration becomes pix2 in the next iteration. We can use this |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
112 fact to avoid a potentially expensive unaligned read, each |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
113 time around the loop. |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
114 Read unaligned pixels into our vectors. The vectors are as follows: |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
115 pix2v: pix2[0]-pix2[15] |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
116 Split the pixel vectors into shorts |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
117 */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
118 tv = (vector unsigned char *) &pix2[0]; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
119 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
120 |
1708 | 121 for(i=0;i<h;i++) { |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
122 /* |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
123 Read unaligned pixels into our vectors. The vectors are as follows: |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
124 pix1v: pix1[0]-pix1[15] |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
125 pix3v: pix3[0]-pix3[15] |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
126 */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
127 tv = (vector unsigned char *) pix1; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
128 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
129 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
130 tv = (vector unsigned char *) &pix3[0]; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
131 pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
132 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
133 /* Calculate the average vector */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
134 avgv = vec_avg(pix2v, pix3v); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
135 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
136 /* Calculate a sum of abs differences vector */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
137 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
138 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
139 /* Add each 4 pixel group together and put 4 results into sad */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
140 sad = vec_sum4s(t5, sad); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
141 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
142 pix1 += line_size; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
143 pix2v = pix3v; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
144 pix3 += line_size; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
145 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
146 } |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
147 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
148 /* Sum up the four partial sums, and put the result into s */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
149 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
150 sumdiffs = vec_splat(sumdiffs, 3); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
151 vec_ste(sumdiffs, 0, &s); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
152 return s; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
153 } |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
154 |
1708 | 155 int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
156 { |
981 | 157 int i; |
158 int s __attribute__((aligned(16))); | |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
159 uint8_t *pix3 = pix2 + line_size; |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
160 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
161 const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2); |
981 | 162 vector unsigned char *tv, avgv, t5; |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
163 vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
164 vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
165 vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; |
981 | 166 vector unsigned short avghv, avglv; |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
167 vector unsigned short t1, t2, t3, t4; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
168 vector unsigned int sad; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
169 vector signed int sumdiffs; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
170 |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
171 sad = (vector unsigned int)vec_splat_u32(0); |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
172 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
173 s = 0; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
174 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
175 /* |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
176 Due to the fact that pix3 = pix2 + line_size, the pix3 of one |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
177 iteration becomes pix2 in the next iteration. We can use this |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
178 fact to avoid a potentially expensive unaligned read, as well |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
179 as some splitting, and vector addition each time around the loop. |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
180 Read unaligned pixels into our vectors. The vectors are as follows: |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
181 pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
182 Split the pixel vectors into shorts |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
183 */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
184 tv = (vector unsigned char *) &pix2[0]; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
185 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
186 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
187 tv = (vector unsigned char *) &pix2[1]; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
188 pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
189 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
190 pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
191 pix2lv = (vector unsigned short) vec_mergel(zero, pix2v); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
192 pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
193 pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
194 t1 = vec_add(pix2hv, pix2ihv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
195 t2 = vec_add(pix2lv, pix2ilv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
196 |
1708 | 197 for(i=0;i<h;i++) { |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
198 /* |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
199 Read unaligned pixels into our vectors. The vectors are as follows: |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
200 pix1v: pix1[0]-pix1[15] |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
201 pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16] |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
202 */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
203 tv = (vector unsigned char *) pix1; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
204 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
205 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
206 tv = (vector unsigned char *) &pix3[0]; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
207 pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
208 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
209 tv = (vector unsigned char *) &pix3[1]; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
210 pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1])); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
211 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
212 /* |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
213 Note that Altivec does have vec_avg, but this works on vector pairs |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
214 and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
215 would mean that, for example, avg(3,0,0,1) = 2, when it should be 1. |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
216 Instead, we have to split the pixel vectors into vectors of shorts, |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
217 and do the averaging by hand. |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
218 */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
219 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
220 /* Split the pixel vectors into shorts */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
221 pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
222 pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
223 pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
224 pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
225 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
226 /* Do the averaging on them */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
227 t3 = vec_add(pix3hv, pix3ihv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
228 t4 = vec_add(pix3lv, pix3ilv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
229 |
884 | 230 avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); |
231 avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); | |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
232 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
233 /* Pack the shorts back into a result */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
234 avgv = vec_pack(avghv, avglv); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
235 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
236 /* Calculate a sum of abs differences vector */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
237 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
238 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
239 /* Add each 4 pixel group together and put 4 results into sad */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
240 sad = vec_sum4s(t5, sad); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
241 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
242 pix1 += line_size; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
243 pix3 += line_size; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
244 /* Transfer the calculated values for pix3 into pix2 */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
245 t1 = t3; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
246 t2 = t4; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
247 } |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
248 /* Sum up the four partial sums, and put the result into s */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
249 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
250 sumdiffs = vec_splat(sumdiffs, 3); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
251 vec_ste(sumdiffs, 0, &s); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
252 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
253 return s; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
254 } |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
255 |
1708 | 256 int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
257 { |
981 | 258 int i; |
259 int s __attribute__((aligned(16))); | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
260 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
261 vector unsigned char perm1, perm2, *pix1v, *pix2v; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
262 vector unsigned char t1, t2, t3,t4, t5; |
981 | 263 vector unsigned int sad; |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
264 vector signed int sumdiffs; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
265 |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
266 sad = (vector unsigned int)vec_splat_u32(0); |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
267 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
268 |
1708 | 269 for(i=0;i<h;i++) { |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
270 /* Read potentially unaligned pixels into t1 and t2 */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
271 perm1 = vec_lvsl(0, pix1); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
272 pix1v = (vector unsigned char *) pix1; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
273 perm2 = vec_lvsl(0, pix2); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
274 pix2v = (vector unsigned char *) pix2; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
275 t1 = vec_perm(pix1v[0], pix1v[1], perm1); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
276 t2 = vec_perm(pix2v[0], pix2v[1], perm2); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
277 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
278 /* Calculate a sum of abs differences vector */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
279 t3 = vec_max(t1, t2); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
280 t4 = vec_min(t1, t2); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
281 t5 = vec_sub(t3, t4); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
282 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
283 /* Add each 4 pixel group together and put 4 results into sad */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
284 sad = vec_sum4s(t5, sad); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
285 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
286 pix1 += line_size; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
287 pix2 += line_size; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
288 } |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
289 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
290 /* Sum up the four partial sums, and put the result into s */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
291 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
292 sumdiffs = vec_splat(sumdiffs, 3); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
293 vec_ste(sumdiffs, 0, &s); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
294 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
295 return s; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
296 } |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
297 |
1708 | 298 int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
299 { |
981 | 300 int i; |
301 int s __attribute__((aligned(16))); | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
302 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
303 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
304 vector unsigned char t1, t2, t3,t4, t5; |
981 | 305 vector unsigned int sad; |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
306 vector signed int sumdiffs; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
307 |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
308 sad = (vector unsigned int)vec_splat_u32(0); |
1277
f3152eb76f1a
altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents:
1064
diff
changeset
|
309 |
f3152eb76f1a
altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents:
1064
diff
changeset
|
310 permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
311 |
1708 | 312 for(i=0;i<h;i++) { |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
313 /* Read potentially unaligned pixels into t1 and t2 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
314 Since we're reading 16 pixels, and actually only want 8, |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
315 mask out the last 8 pixels. The 0s don't change the sum. */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
316 perm1 = vec_lvsl(0, pix1); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
317 pix1v = (vector unsigned char *) pix1; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
318 perm2 = vec_lvsl(0, pix2); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
319 pix2v = (vector unsigned char *) pix2; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
320 t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
321 t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
322 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
323 /* Calculate a sum of abs differences vector */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
324 t3 = vec_max(t1, t2); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
325 t4 = vec_min(t1, t2); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
326 t5 = vec_sub(t3, t4); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
327 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
328 /* Add each 4 pixel group together and put 4 results into sad */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
329 sad = vec_sum4s(t5, sad); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
330 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
331 pix1 += line_size; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
332 pix2 += line_size; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
333 } |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
334 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
335 /* Sum up the four partial sums, and put the result into s */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
336 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
337 sumdiffs = vec_splat(sumdiffs, 3); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
338 vec_ste(sumdiffs, 0, &s); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
339 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
340 return s; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
341 } |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
342 |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
343 int pix_norm1_altivec(uint8_t *pix, int line_size) |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
344 { |
981 | 345 int i; |
346 int s __attribute__((aligned(16))); | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
347 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); |
981 | 348 vector unsigned char *tv; |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
349 vector unsigned char pixv; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
350 vector unsigned int sv; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
351 vector signed int sum; |
981 | 352 |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
353 sv = (vector unsigned int)vec_splat_u32(0); |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
354 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
355 s = 0; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
356 for (i = 0; i < 16; i++) { |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
357 /* Read in the potentially unaligned pixels */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
358 tv = (vector unsigned char *) pix; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
359 pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix)); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
360 |
884 | 361 /* Square the values, and add them to our sum */ |
362 sv = vec_msum(pixv, pixv, sv); | |
878
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
363 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
364 pix += line_size; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
365 } |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
366 /* Sum up the four partial sums, and put the result into s */ |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
367 sum = vec_sums((vector signed int) sv, (vector signed int) zero); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
368 sum = vec_splat(sum, 3); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
369 vec_ste(sum, 0, &s); |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
370 |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
371 return s; |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
372 } |
6ea69518e5f7
altivec optimizations patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
828
diff
changeset
|
373 |
981 | 374 /** |
375 * Sum of Squared Errors for a 8x8 block. | |
376 * AltiVec-enhanced. | |
1708 | 377 * It's the sad8_altivec code above w/ squaring added. |
981 | 378 */ |
1708 | 379 int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) |
981 | 380 { |
381 int i; | |
382 int s __attribute__((aligned(16))); | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
383 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); |
981 | 384 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; |
385 vector unsigned char t1, t2, t3,t4, t5; | |
386 vector unsigned int sum; | |
387 vector signed int sumsqr; | |
388 | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
389 sum = (vector unsigned int)vec_splat_u32(0); |
1277
f3152eb76f1a
altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents:
1064
diff
changeset
|
390 |
f3152eb76f1a
altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents:
1064
diff
changeset
|
391 permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); |
f3152eb76f1a
altivec gcc-3 fixes by (Magnus Damm <damm at opensource dot se>)
michaelni
parents:
1064
diff
changeset
|
392 |
981 | 393 |
1708 | 394 for(i=0;i<h;i++) { |
981 | 395 /* Read potentially unaligned pixels into t1 and t2 |
396 Since we're reading 16 pixels, and actually only want 8, | |
397 mask out the last 8 pixels. The 0s don't change the sum. */ | |
398 perm1 = vec_lvsl(0, pix1); | |
399 pix1v = (vector unsigned char *) pix1; | |
400 perm2 = vec_lvsl(0, pix2); | |
401 pix2v = (vector unsigned char *) pix2; | |
402 t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); | |
403 t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); | |
404 | |
405 /* | |
406 Since we want to use unsigned chars, we can take advantage | |
407 of the fact that abs(a-b)^2 = (a-b)^2. | |
408 */ | |
409 | |
410 /* Calculate abs differences vector */ | |
411 t3 = vec_max(t1, t2); | |
412 t4 = vec_min(t1, t2); | |
413 t5 = vec_sub(t3, t4); | |
414 | |
415 /* Square the values and add them to our sum */ | |
416 sum = vec_msum(t5, t5, sum); | |
417 | |
418 pix1 += line_size; | |
419 pix2 += line_size; | |
420 } | |
421 | |
422 /* Sum up the four partial sums, and put the result into s */ | |
423 sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); | |
424 sumsqr = vec_splat(sumsqr, 3); | |
425 vec_ste(sumsqr, 0, &s); | |
426 | |
427 return s; | |
428 } | |
429 | |
430 /** | |
431 * Sum of Squared Errors for a 16x16 block. | |
432 * AltiVec-enhanced. | |
1708 | 433 * It's the sad16_altivec code above w/ squaring added. |
981 | 434 */ |
1708 | 435 int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) |
981 | 436 { |
437 int i; | |
438 int s __attribute__((aligned(16))); | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
439 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); |
981 | 440 vector unsigned char perm1, perm2, *pix1v, *pix2v; |
441 vector unsigned char t1, t2, t3,t4, t5; | |
442 vector unsigned int sum; | |
443 vector signed int sumsqr; | |
444 | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
445 sum = (vector unsigned int)vec_splat_u32(0); |
981 | 446 |
1708 | 447 for(i=0;i<h;i++) { |
981 | 448 /* Read potentially unaligned pixels into t1 and t2 */ |
449 perm1 = vec_lvsl(0, pix1); | |
450 pix1v = (vector unsigned char *) pix1; | |
451 perm2 = vec_lvsl(0, pix2); | |
452 pix2v = (vector unsigned char *) pix2; | |
453 t1 = vec_perm(pix1v[0], pix1v[1], perm1); | |
454 t2 = vec_perm(pix2v[0], pix2v[1], perm2); | |
455 | |
456 /* | |
457 Since we want to use unsigned chars, we can take advantage | |
458 of the fact that abs(a-b)^2 = (a-b)^2. | |
459 */ | |
460 | |
461 /* Calculate abs differences vector */ | |
462 t3 = vec_max(t1, t2); | |
463 t4 = vec_min(t1, t2); | |
464 t5 = vec_sub(t3, t4); | |
465 | |
466 /* Square the values and add them to our sum */ | |
467 sum = vec_msum(t5, t5, sum); | |
468 | |
469 pix1 += line_size; | |
470 pix2 += line_size; | |
471 } | |
472 | |
473 /* Sum up the four partial sums, and put the result into s */ | |
474 sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); | |
475 sumsqr = vec_splat(sumsqr, 3); | |
476 vec_ste(sumsqr, 0, &s); | |
477 | |
478 return s; | |
479 } | |
480 | |
1064 | 481 int pix_sum_altivec(uint8_t * pix, int line_size) |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
482 { |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
483 const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
484 vector unsigned char perm, *pixv; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
485 vector unsigned char t1; |
981 | 486 vector unsigned int sad; |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
487 vector signed int sumdiffs; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
488 |
981 | 489 int i; |
490 int s __attribute__((aligned(16))); | |
491 | |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
492 sad = (vector unsigned int)vec_splat_u32(0); |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
493 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
494 for (i = 0; i < 16; i++) { |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
495 /* Read the potentially unaligned 16 pixels into t1 */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
496 perm = vec_lvsl(0, pix); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
497 pixv = (vector unsigned char *) pix; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
498 t1 = vec_perm(pixv[0], pixv[1], perm); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
499 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
500 /* Add each 4 pixel group together and put 4 results into sad */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
501 sad = vec_sum4s(t1, sad); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
502 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
503 pix += line_size; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
504 } |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
505 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
506 /* Sum up the four partial sums, and put the result into s */ |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
507 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
508 sumdiffs = vec_splat(sumdiffs, 3); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
509 vec_ste(sumdiffs, 0, &s); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
510 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
511 return s; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
512 } |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
513 |
1064 | 514 void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size) |
828
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
515 { |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
516 int i; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
517 vector unsigned char perm, bytes, *pixv; |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
518 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); |
828
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
519 vector signed short shorts; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
520 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
521 for(i=0;i<8;i++) |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
522 { |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
523 // Read potentially unaligned pixels. |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
524 // We're reading 16 pixels, and actually only want 8, |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
525 // but we simply ignore the extras. |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
526 perm = vec_lvsl(0, pixels); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
527 pixv = (vector unsigned char *) pixels; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
528 bytes = vec_perm(pixv[0], pixv[1], perm); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
529 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
530 // convert the bytes into shorts |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
531 shorts = (vector signed short)vec_mergeh(zero, bytes); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
532 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
533 // save the data to the block, we assume the block is 16-byte aligned |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
534 vec_st(shorts, i*16, (vector signed short*)block); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
535 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
536 pixels += line_size; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
537 } |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
538 } |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
539 |
1064 | 540 void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1, |
541 const uint8_t *s2, int stride) | |
828
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
542 { |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
543 int i; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
544 vector unsigned char perm, bytes, *pixv; |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
545 const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); |
828
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
546 vector signed short shorts1, shorts2; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
547 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
548 for(i=0;i<4;i++) |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
549 { |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
550 // Read potentially unaligned pixels |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
551 // We're reading 16 pixels, and actually only want 8, |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
552 // but we simply ignore the extras. |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
553 perm = vec_lvsl(0, s1); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
554 pixv = (vector unsigned char *) s1; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
555 bytes = vec_perm(pixv[0], pixv[1], perm); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
556 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
557 // convert the bytes into shorts |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
558 shorts1 = (vector signed short)vec_mergeh(zero, bytes); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
559 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
560 // Do the same for the second block of pixels |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
561 perm = vec_lvsl(0, s2); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
562 pixv = (vector unsigned char *) s2; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
563 bytes = vec_perm(pixv[0], pixv[1], perm); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
564 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
565 // convert the bytes into shorts |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
566 shorts2 = (vector signed short)vec_mergeh(zero, bytes); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
567 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
568 // Do the subtraction |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
569 shorts1 = vec_sub(shorts1, shorts2); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
570 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
571 // save the data to the block, we assume the block is 16-byte aligned |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
572 vec_st(shorts1, 0, (vector signed short*)block); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
573 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
574 s1 += stride; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
575 s2 += stride; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
576 block += 8; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
577 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
578 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
579 // The code below is a copy of the code above... This is a manual |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
580 // unroll. |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
581 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
582 // Read potentially unaligned pixels |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
583 // We're reading 16 pixels, and actually only want 8, |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
584 // but we simply ignore the extras. |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
585 perm = vec_lvsl(0, s1); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
586 pixv = (vector unsigned char *) s1; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
587 bytes = vec_perm(pixv[0], pixv[1], perm); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
588 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
589 // convert the bytes into shorts |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
590 shorts1 = (vector signed short)vec_mergeh(zero, bytes); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
591 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
592 // Do the same for the second block of pixels |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
593 perm = vec_lvsl(0, s2); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
594 pixv = (vector unsigned char *) s2; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
595 bytes = vec_perm(pixv[0], pixv[1], perm); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
596 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
597 // convert the bytes into shorts |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
598 shorts2 = (vector signed short)vec_mergeh(zero, bytes); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
599 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
600 // Do the subtraction |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
601 shorts1 = vec_sub(shorts1, shorts2); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
602 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
603 // save the data to the block, we assume the block is 16-byte aligned |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
604 vec_st(shorts1, 0, (vector signed short*)block); |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
605 |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
606 s1 += stride; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
607 s2 += stride; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
608 block += 8; |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
609 } |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
610 } |
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
611 |
995
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
612 void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
613 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
995
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
614 int i; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
615 for(i=0; i+7<w; i++){ |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
616 dst[i+0] += src[i+0]; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
617 dst[i+1] += src[i+1]; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
618 dst[i+2] += src[i+2]; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
619 dst[i+3] += src[i+3]; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
620 dst[i+4] += src[i+4]; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
621 dst[i+5] += src[i+5]; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
622 dst[i+6] += src[i+6]; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
623 dst[i+7] += src[i+7]; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
624 } |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
625 for(; i<w; i++) |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
626 dst[i+0] += src[i+0]; |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
627 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
995
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
628 register int i; |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
629 register vector unsigned char vdst, vsrc; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
630 |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
631 /* dst and src are 16 bytes-aligned (guaranteed) */ |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
632 for(i = 0 ; (i + 15) < w ; i++) |
995
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
633 { |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
634 vdst = vec_ld(i << 4, (unsigned char*)dst); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
635 vsrc = vec_ld(i << 4, (unsigned char*)src); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
636 vdst = vec_add(vsrc, vdst); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
637 vec_st(vdst, i << 4, (unsigned char*)dst); |
995
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
638 } |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
639 /* if w is not a multiple of 16 */ |
995
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
640 for (; (i < w) ; i++) |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
641 { |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
642 dst[i] = src[i]; |
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
643 } |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
644 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
645 } |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
646 |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
647 /* next one assumes that ((line_size % 16) == 0) */ |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
648 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
649 { |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
650 POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
651 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
652 int i; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
653 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
654 POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
655 |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
656 for(i=0; i<h; i++) { |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
657 *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
658 *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
659 *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
660 *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
661 pixels+=line_size; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
662 block +=line_size; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
663 } |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
664 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
665 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
666 |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
667 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
668 register vector unsigned char pixelsv1, pixelsv2; |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
669 register vector unsigned char pixelsv1B, pixelsv2B; |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
670 register vector unsigned char pixelsv1C, pixelsv2C; |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
671 register vector unsigned char pixelsv1D, pixelsv2D; |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
672 |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
673 register vector unsigned char perm = vec_lvsl(0, pixels); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
674 int i; |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
675 register int line_size_2 = line_size << 1; |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
676 register int line_size_3 = line_size + line_size_2; |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
677 register int line_size_4 = line_size << 2; |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
678 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
679 POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
680 // hand-unrolling the loop by 4 gains about 15% |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
681 // mininum execution time goes from 74 to 60 cycles |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
682 // it's faster than -funroll-loops, but using |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
683 // -funroll-loops w/ this is bad - 74 cycles again. |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
684 // all this is on a 7450, tuning for the 7450 |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
685 #if 0 |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
686 for(i=0; i<h; i++) { |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
687 pixelsv1 = vec_ld(0, (unsigned char*)pixels); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
688 pixelsv2 = vec_ld(16, (unsigned char*)pixels); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
689 vec_st(vec_perm(pixelsv1, pixelsv2, perm), |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
690 0, (unsigned char*)block); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
691 pixels+=line_size; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
692 block +=line_size; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
693 } |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
694 #else |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
695 for(i=0; i<h; i+=4) { |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
696 pixelsv1 = vec_ld(0, (unsigned char*)pixels); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
697 pixelsv2 = vec_ld(16, (unsigned char*)pixels); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
698 pixelsv1B = vec_ld(line_size, (unsigned char*)pixels); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
699 pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
700 pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
701 pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
702 pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
703 pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
704 vec_st(vec_perm(pixelsv1, pixelsv2, perm), |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
705 0, (unsigned char*)block); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
706 vec_st(vec_perm(pixelsv1B, pixelsv2B, perm), |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
707 line_size, (unsigned char*)block); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
708 vec_st(vec_perm(pixelsv1C, pixelsv2C, perm), |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
709 line_size_2, (unsigned char*)block); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
710 vec_st(vec_perm(pixelsv1D, pixelsv2D, perm), |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
711 line_size_3, (unsigned char*)block); |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
712 pixels+=line_size_4; |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
713 block +=line_size_4; |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
714 } |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
715 #endif |
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
716 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
717 |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
718 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
719 } |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
720 |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
721 /* next one assumes that ((line_size % 16) == 0) */ |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
722 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
723 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
724 { |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
725 POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
726 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
727 int i; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
728 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
729 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
730 |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
731 for(i=0; i<h; i++) { |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
732 op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l)); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
733 op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l)); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
734 op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l)); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
735 op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l)); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
736 pixels+=line_size; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
737 block +=line_size; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
738 } |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
739 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
740 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
741 |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
742 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
743 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
744 register vector unsigned char perm = vec_lvsl(0, pixels); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
745 int i; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
746 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
747 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
748 |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
749 for(i=0; i<h; i++) { |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
750 pixelsv1 = vec_ld(0, (unsigned char*)pixels); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
751 pixelsv2 = vec_ld(16, (unsigned char*)pixels); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
752 blockv = vec_ld(0, block); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
753 pixelsv = vec_perm(pixelsv1, pixelsv2, perm); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
754 blockv = vec_avg(blockv,pixelsv); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
755 vec_st(blockv, 0, (unsigned char*)block); |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
756 pixels+=line_size; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
757 block +=line_size; |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
758 } |
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
759 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
760 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
761 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
762 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
763 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
764 |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
765 /* next one assumes that ((line_size % 8) == 0) */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
766 void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
767 { |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
768 POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
769 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
770 int i; |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
771 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
772 for (i = 0; i < h; i++) { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
773 *((uint32_t *) (block)) = |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
774 (((*((uint32_t *) (block))) | |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
775 ((((const struct unaligned_32 *) (pixels))->l))) - |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
776 ((((*((uint32_t *) (block))) ^ |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
777 ((((const struct unaligned_32 *) (pixels))-> |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
778 l))) & 0xFEFEFEFEUL) >> 1)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
779 *((uint32_t *) (block + 4)) = |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
780 (((*((uint32_t *) (block + 4))) | |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
781 ((((const struct unaligned_32 *) (pixels + 4))->l))) - |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
782 ((((*((uint32_t *) (block + 4))) ^ |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
783 ((((const struct unaligned_32 *) (pixels + |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
784 4))-> |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
785 l))) & 0xFEFEFEFEUL) >> 1)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
786 pixels += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
787 block += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
788 } |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
789 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
790 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
791 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
792 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
793 int i; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
794 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
795 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
796 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
797 for (i = 0; i < h; i++) { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
798 /* |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
799 block is 8 bytes-aligned, so we're either in the |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
800 left block (16 bytes-aligned) or in the right block (not) |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
801 */ |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
802 int rightside = ((unsigned long)block & 0x0000000F); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
803 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
804 blockv = vec_ld(0, block); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
805 pixelsv1 = vec_ld(0, (unsigned char*)pixels); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
806 pixelsv2 = vec_ld(16, (unsigned char*)pixels); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
807 pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
808 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
809 if (rightside) |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
810 { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
811 pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
812 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
813 else |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
814 { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
815 pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
816 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
817 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
818 blockv = vec_avg(blockv, pixelsv); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
819 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
820 vec_st(blockv, 0, block); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
821 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
822 pixels += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
823 block += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
824 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
825 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
826 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
827 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
828 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
829 } |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
830 |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
831 /* next one assumes that ((line_size % 8) == 0) */ |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
832 void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
833 { |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
834 POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
835 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
836 int j; |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
837 POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
838 for (j = 0; j < 2; j++) { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
839 int i; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
840 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
841 const uint32_t b = |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
842 (((const struct unaligned_32 *) (pixels + 1))->l); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
843 uint32_t l0 = |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
844 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
845 uint32_t h0 = |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
846 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
847 uint32_t l1, h1; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
848 pixels += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
849 for (i = 0; i < h; i += 2) { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
850 uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
851 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
852 l1 = (a & 0x03030303UL) + (b & 0x03030303UL); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
853 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
854 *((uint32_t *) block) = |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
855 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
856 pixels += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
857 block += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
858 a = (((const struct unaligned_32 *) (pixels))->l); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
859 b = (((const struct unaligned_32 *) (pixels + 1))->l); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
860 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
861 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
862 *((uint32_t *) block) = |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
863 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
864 pixels += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
865 block += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
866 } pixels += 4 - line_size * (h + 1); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
867 block += 4 - line_size * h; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
868 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
869 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
870 POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
871 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
872 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
873 register int i; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
874 register vector unsigned char |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
875 pixelsv1, pixelsv2, |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
876 pixelsavg; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
877 register vector unsigned char |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
878 blockv, temp1, temp2; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
879 register vector unsigned short |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
880 pixelssum1, pixelssum2, temp3; |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
881 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
882 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
883 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
884 temp1 = vec_ld(0, pixels); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
885 temp2 = vec_ld(16, pixels); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
886 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
887 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
888 { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
889 pixelsv2 = temp2; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
890 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
891 else |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
892 { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
893 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
894 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
895 pixelsv1 = vec_mergeh(vczero, pixelsv1); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
896 pixelsv2 = vec_mergeh(vczero, pixelsv2); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
897 pixelssum1 = vec_add((vector unsigned short)pixelsv1, |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
898 (vector unsigned short)pixelsv2); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
899 pixelssum1 = vec_add(pixelssum1, vctwo); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
900 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
901 POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); |
1015
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
902 for (i = 0; i < h ; i++) { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
903 int rightside = ((unsigned long)block & 0x0000000F); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
904 blockv = vec_ld(0, block); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
905 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
906 temp1 = vec_ld(line_size, pixels); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
907 temp2 = vec_ld(line_size + 16, pixels); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
908 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
909 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
910 { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
911 pixelsv2 = temp2; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
912 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
913 else |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
914 { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
915 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
916 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
917 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
918 pixelsv1 = vec_mergeh(vczero, pixelsv1); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
919 pixelsv2 = vec_mergeh(vczero, pixelsv2); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
920 pixelssum2 = vec_add((vector unsigned short)pixelsv1, |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
921 (vector unsigned short)pixelsv2); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
922 temp3 = vec_add(pixelssum1, pixelssum2); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
923 temp3 = vec_sra(temp3, vctwo); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
924 pixelssum1 = vec_add(pixelssum2, vctwo); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
925 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
926 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
927 if (rightside) |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
928 { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
929 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
930 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
931 else |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
932 { |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
933 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
934 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
935 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
936 vec_st(blockv, 0, block); |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
937 |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
938 block += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
939 pixels += line_size; |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
940 } |
35cf2f4a0f8c
PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1009
diff
changeset
|
941 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
942 POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); |
1009
3b7cc8e4b83f
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
995
diff
changeset
|
943 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
995
edc10966b081
altivec jumbo patch by (Romain Dolbeau <dolbeaur at club-internet dot fr>)
michaelni
parents:
981
diff
changeset
|
944 } |
828
ace3ccd18dd2
Altivec Patch (Mark III) by (Dieter Shirley <dieters at schemasoft dot com>)
michaelni
parents:
638
diff
changeset
|
945 |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
946 /* next one assumes that ((line_size % 8) == 0) */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
947 void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
948 { |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
949 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
950 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
951 int j; |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
952 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
953 for (j = 0; j < 2; j++) { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
954 int i; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
955 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
956 const uint32_t b = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
957 (((const struct unaligned_32 *) (pixels + 1))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
958 uint32_t l0 = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
959 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
960 uint32_t h0 = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
961 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
962 uint32_t l1, h1; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
963 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
964 for (i = 0; i < h; i += 2) { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
965 uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
966 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
967 l1 = (a & 0x03030303UL) + (b & 0x03030303UL); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
968 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
969 *((uint32_t *) block) = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
970 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
971 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
972 block += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
973 a = (((const struct unaligned_32 *) (pixels))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
974 b = (((const struct unaligned_32 *) (pixels + 1))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
975 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
976 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
977 *((uint32_t *) block) = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
978 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
979 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
980 block += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
981 } pixels += 4 - line_size * (h + 1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
982 block += 4 - line_size * h; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
983 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
984 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
985 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
986 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
987 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
988 register int i; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
989 register vector unsigned char |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
990 pixelsv1, pixelsv2, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
991 pixelsavg; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
992 register vector unsigned char |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
993 blockv, temp1, temp2; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
994 register vector unsigned short |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
995 pixelssum1, pixelssum2, temp3; |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
996 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
997 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
998 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
999 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1000 temp1 = vec_ld(0, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1001 temp2 = vec_ld(16, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1002 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1003 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1004 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1005 pixelsv2 = temp2; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1006 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1007 else |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1008 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1009 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1010 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1011 pixelsv1 = vec_mergeh(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1012 pixelsv2 = vec_mergeh(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1013 pixelssum1 = vec_add((vector unsigned short)pixelsv1, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1014 (vector unsigned short)pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1015 pixelssum1 = vec_add(pixelssum1, vcone); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1016 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1017 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1018 for (i = 0; i < h ; i++) { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1019 int rightside = ((unsigned long)block & 0x0000000F); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1020 blockv = vec_ld(0, block); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1021 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1022 temp1 = vec_ld(line_size, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1023 temp2 = vec_ld(line_size + 16, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1024 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1025 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1026 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1027 pixelsv2 = temp2; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1028 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1029 else |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1030 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1031 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1032 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1033 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1034 pixelsv1 = vec_mergeh(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1035 pixelsv2 = vec_mergeh(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1036 pixelssum2 = vec_add((vector unsigned short)pixelsv1, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1037 (vector unsigned short)pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1038 temp3 = vec_add(pixelssum1, pixelssum2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1039 temp3 = vec_sra(temp3, vctwo); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1040 pixelssum1 = vec_add(pixelssum2, vcone); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1041 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1042 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1043 if (rightside) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1044 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1045 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1046 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1047 else |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1048 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1049 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1050 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1051 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1052 vec_st(blockv, 0, block); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1053 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1054 block += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1055 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1056 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1057 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1058 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1059 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1060 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1061 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1062 /* next one assumes that ((line_size % 16) == 0) */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1063 void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1064 { |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1065 POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1066 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1067 int j; |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1068 POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1069 for (j = 0; j < 4; j++) { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1070 int i; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1071 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1072 const uint32_t b = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1073 (((const struct unaligned_32 *) (pixels + 1))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1074 uint32_t l0 = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1075 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1076 uint32_t h0 = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1077 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1078 uint32_t l1, h1; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1079 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1080 for (i = 0; i < h; i += 2) { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1081 uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1082 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1083 l1 = (a & 0x03030303UL) + (b & 0x03030303UL); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1084 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1085 *((uint32_t *) block) = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1086 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1087 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1088 block += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1089 a = (((const struct unaligned_32 *) (pixels))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1090 b = (((const struct unaligned_32 *) (pixels + 1))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1091 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1092 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1093 *((uint32_t *) block) = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1094 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1095 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1096 block += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1097 } pixels += 4 - line_size * (h + 1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1098 block += 4 - line_size * h; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1099 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1100 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1101 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1102 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1103 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1104 register int i; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1105 register vector unsigned char |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1106 pixelsv1, pixelsv2, pixelsv3, pixelsv4; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1107 register vector unsigned char |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1108 blockv, temp1, temp2; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1109 register vector unsigned short |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1110 pixelssum1, pixelssum2, temp3, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1111 pixelssum3, pixelssum4, temp4; |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1112 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1113 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); |
1340
09b8fe0f0139
PPC fixes & clean-up patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1277
diff
changeset
|
1114 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1115 POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); |
1340
09b8fe0f0139
PPC fixes & clean-up patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1277
diff
changeset
|
1116 |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1117 temp1 = vec_ld(0, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1118 temp2 = vec_ld(16, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1119 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1120 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1121 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1122 pixelsv2 = temp2; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1123 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1124 else |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1125 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1126 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1127 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1128 pixelsv3 = vec_mergel(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1129 pixelsv4 = vec_mergel(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1130 pixelsv1 = vec_mergeh(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1131 pixelsv2 = vec_mergeh(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1132 pixelssum3 = vec_add((vector unsigned short)pixelsv3, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1133 (vector unsigned short)pixelsv4); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1134 pixelssum3 = vec_add(pixelssum3, vctwo); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1135 pixelssum1 = vec_add((vector unsigned short)pixelsv1, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1136 (vector unsigned short)pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1137 pixelssum1 = vec_add(pixelssum1, vctwo); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1138 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1139 for (i = 0; i < h ; i++) { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1140 blockv = vec_ld(0, block); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1141 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1142 temp1 = vec_ld(line_size, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1143 temp2 = vec_ld(line_size + 16, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1144 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1145 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1146 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1147 pixelsv2 = temp2; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1148 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1149 else |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1150 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1151 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1152 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1153 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1154 pixelsv3 = vec_mergel(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1155 pixelsv4 = vec_mergel(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1156 pixelsv1 = vec_mergeh(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1157 pixelsv2 = vec_mergeh(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1158 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1159 pixelssum4 = vec_add((vector unsigned short)pixelsv3, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1160 (vector unsigned short)pixelsv4); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1161 pixelssum2 = vec_add((vector unsigned short)pixelsv1, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1162 (vector unsigned short)pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1163 temp4 = vec_add(pixelssum3, pixelssum4); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1164 temp4 = vec_sra(temp4, vctwo); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1165 temp3 = vec_add(pixelssum1, pixelssum2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1166 temp3 = vec_sra(temp3, vctwo); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1167 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1168 pixelssum3 = vec_add(pixelssum4, vctwo); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1169 pixelssum1 = vec_add(pixelssum2, vctwo); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1170 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1171 blockv = vec_packsu(temp3, temp4); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1172 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1173 vec_st(blockv, 0, block); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1174 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1175 block += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1176 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1177 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1178 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1179 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1180 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1181 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1182 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1183 /* next one assumes that ((line_size % 16) == 0) */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1184 void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1185 { |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1186 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1187 #ifdef ALTIVEC_USE_REFERENCE_C_CODE |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1188 int j; |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1189 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1190 for (j = 0; j < 4; j++) { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1191 int i; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1192 const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1193 const uint32_t b = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1194 (((const struct unaligned_32 *) (pixels + 1))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1195 uint32_t l0 = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1196 (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1197 uint32_t h0 = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1198 ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1199 uint32_t l1, h1; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1200 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1201 for (i = 0; i < h; i += 2) { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1202 uint32_t a = (((const struct unaligned_32 *) (pixels))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1203 uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1204 l1 = (a & 0x03030303UL) + (b & 0x03030303UL); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1205 h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1206 *((uint32_t *) block) = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1207 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1208 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1209 block += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1210 a = (((const struct unaligned_32 *) (pixels))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1211 b = (((const struct unaligned_32 *) (pixels + 1))->l); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1212 l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1213 h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1214 *((uint32_t *) block) = |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1215 h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1216 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1217 block += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1218 } pixels += 4 - line_size * (h + 1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1219 block += 4 - line_size * h; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1220 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1221 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1222 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1223 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1224 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1225 register int i; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1226 register vector unsigned char |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1227 pixelsv1, pixelsv2, pixelsv3, pixelsv4; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1228 register vector unsigned char |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1229 blockv, temp1, temp2; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1230 register vector unsigned short |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1231 pixelssum1, pixelssum2, temp3, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1232 pixelssum3, pixelssum4, temp4; |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1233 register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1234 register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1235 register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); |
1340
09b8fe0f0139
PPC fixes & clean-up patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1277
diff
changeset
|
1236 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1237 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); |
1340
09b8fe0f0139
PPC fixes & clean-up patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1277
diff
changeset
|
1238 |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1239 temp1 = vec_ld(0, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1240 temp2 = vec_ld(16, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1241 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1242 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1243 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1244 pixelsv2 = temp2; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1245 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1246 else |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1247 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1248 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1249 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1250 pixelsv3 = vec_mergel(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1251 pixelsv4 = vec_mergel(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1252 pixelsv1 = vec_mergeh(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1253 pixelsv2 = vec_mergeh(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1254 pixelssum3 = vec_add((vector unsigned short)pixelsv3, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1255 (vector unsigned short)pixelsv4); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1256 pixelssum3 = vec_add(pixelssum3, vcone); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1257 pixelssum1 = vec_add((vector unsigned short)pixelsv1, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1258 (vector unsigned short)pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1259 pixelssum1 = vec_add(pixelssum1, vcone); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1260 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1261 for (i = 0; i < h ; i++) { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1262 blockv = vec_ld(0, block); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1263 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1264 temp1 = vec_ld(line_size, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1265 temp2 = vec_ld(line_size + 16, pixels); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1266 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1267 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1268 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1269 pixelsv2 = temp2; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1270 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1271 else |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1272 { |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1273 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1274 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1275 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1276 pixelsv3 = vec_mergel(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1277 pixelsv4 = vec_mergel(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1278 pixelsv1 = vec_mergeh(vczero, pixelsv1); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1279 pixelsv2 = vec_mergeh(vczero, pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1280 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1281 pixelssum4 = vec_add((vector unsigned short)pixelsv3, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1282 (vector unsigned short)pixelsv4); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1283 pixelssum2 = vec_add((vector unsigned short)pixelsv1, |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1284 (vector unsigned short)pixelsv2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1285 temp4 = vec_add(pixelssum3, pixelssum4); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1286 temp4 = vec_sra(temp4, vctwo); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1287 temp3 = vec_add(pixelssum1, pixelssum2); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1288 temp3 = vec_sra(temp3, vctwo); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1289 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1290 pixelssum3 = vec_add(pixelssum4, vcone); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1291 pixelssum1 = vec_add(pixelssum2, vcone); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1292 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1293 blockv = vec_packsu(temp3, temp4); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1294 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1295 vec_st(blockv, 0, block); |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1296 |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1297 block += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1298 pixels += line_size; |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1299 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1300 |
1352
e8ff4783f188
1) remove TBL support in PPC performance. It's much more useful to use the
michaelni
parents:
1340
diff
changeset
|
1301 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); |
1024
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1302 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1303 } |
9cc1031e1864
More AltiVec MC functions patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
michaelni
parents:
1015
diff
changeset
|
1304 |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1305 int has_altivec(void) |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1306 { |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1307 #ifdef CONFIG_DARWIN |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1308 int sels[2] = {CTL_HW, HW_VECTORUNIT}; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1309 int has_vu = 0; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1310 size_t len = sizeof(has_vu); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1311 int err; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1312 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1313 err = sysctl(sels, 2, &has_vu, &len, NULL, 0); |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1314 |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1315 if (err == 0) return (has_vu != 0); |
1033
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1316 #else /* CONFIG_DARWIN */ |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1317 /* no Darwin, do it the brute-force way */ |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1318 /* this is borrowed from the libmpeg2 library */ |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1319 { |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1320 signal (SIGILL, sigill_handler); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1321 if (sigsetjmp (jmpbuf, 1)) { |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1322 signal (SIGILL, SIG_DFL); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1323 } else { |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1324 canjump = 1; |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1325 |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1326 asm volatile ("mtspr 256, %0\n\t" |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1327 "vand %%v0, %%v0, %%v0" |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1328 : |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1329 : "r" (-1)); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1330 |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1331 signal (SIGILL, SIG_DFL); |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1332 return 1; |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1333 } |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1334 } |
b4172ff70d27
Altivec on non darwin systems patch by Romain Dolbeau
bellard
parents:
1024
diff
changeset
|
1335 #endif /* CONFIG_DARWIN */ |
623
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1336 return 0; |
92e99e506920
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot nuigalway dot ie>)
michaelni
parents:
diff
changeset
|
1337 } |