Mercurial > libavcodec.hg
comparison armv4l/dsputil_iwmmxt.c @ 2734:aeea63c97878 libavcodec
Better ARM support for mplayer/ffmpeg, ported from atty fork
While playing with some new hardware, I found it's running a forked mplayer
-- and it looks like they're following the GPL.
The maintainer's page is here: http://atty.jp/?Zaurus/mplayer
Unfortunately it's mostly in Japanese, so it's hard to figure out any
details.
Their code looks quite interesting (at least to those of us w/ ARM CPUs).
The patches I've attached are the patches from atty.jp with a couple of
modifications by myself:
- ported to current CVS
- reverted their change of removing SNOW support from ffmpeg
- cleaned up their bswap mess
- removed DOS-style linebreaks from various files
patch by (Bernhard Rosenkraenzer: bero, arklinux org)
author | michael |
---|---|
date | Thu, 26 May 2005 14:32:46 +0000 |
parents | |
children | 930e56f92c57 |
comparison
equal
deleted
inserted
replaced
2733:32336384162e | 2734:aeea63c97878 |
---|---|
1 /* | |
2 * iWMMXt optimized DSP utils | |
3 * Copyright (c) 2004 AGAWA Koji | |
4 * | |
5 * This library is free software; you can redistribute it and/or | |
6 * modify it under the terms of the GNU Lesser General Public | |
7 * License as published by the Free Software Foundation; either | |
8 * version 2 of the License, or (at your option) any later version. | |
9 * | |
10 * This library is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 * Lesser General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU Lesser General Public | |
16 * License along with this library; if not, write to the Free Software | |
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
18 */ | |
19 | |
20 #include "../dsputil.h" | |
21 | |
22 #define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt | |
23 #define SET_RND(regd) __asm__ __volatile__ ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12"); | |
24 #define WAVG2B "wavg2b" | |
25 #include "dsputil_iwmmxt_rnd.h" | |
26 #undef DEF | |
27 #undef SET_RND | |
28 #undef WAVG2B | |
29 | |
30 #define DEF(x, y) x ## _ ## y ##_iwmmxt | |
31 #define SET_RND(regd) __asm__ __volatile__ ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12"); | |
32 #define WAVG2B "wavg2br" | |
33 #include "dsputil_iwmmxt_rnd.h" | |
34 #undef DEF | |
35 #undef SET_RND | |
36 #undef WAVG2BR | |
37 | |
/*
 * OP(AVG) expands to the inline-asm body shared by the two 8-pixel-wide
 * vertical half-pel "put" functions below.  AVG is the mnemonic of the
 * byte-average instruction ("wavg2br" or "wavg2b").
 *
 * The expansion expects block, pixels, line_size and h to be in scope:
 *   - pixels may have any alignment: its low three bits are moved to wcgr1
 *     and every source row is fetched as two aligned doublewords that are
 *     recombined with walignr1;
 *   - each output row is AVG(source row, following source row) and is written
 *     to block with a single wstrd (no realignment is done on block, so it is
 *     presumably expected to be 8-byte aligned -- confirm against callers);
 *   - two output rows are produced per loop iteration and the loop ends when
 *     h, decremented by 2 each time, reaches exactly zero, so h has to be a
 *     positive even number.
 *
 * "subs" updates the condition flags, therefore "cc" is listed as clobbered
 * in addition to memory and the scratch register r12.
 */
// need scheduling
#define OP(AVG)                                                 \
    __asm__ __volatile__ (                                      \
        /* alignment */                                         \
        "and r12, %[pixels], #7 \n\t"                           \
        "bic %[pixels], %[pixels], #7 \n\t"                     \
        "tmcr wcgr1, r12 \n\t"                                  \
                                                                \
        /* first source row -> wr4 */                           \
        "wldrd wr0, [%[pixels]] \n\t"                           \
        "wldrd wr1, [%[pixels], #8] \n\t"                       \
        "add %[pixels], %[pixels], %[line_size] \n\t"           \
        "walignr1 wr4, wr0, wr1 \n\t"                           \
                                                                \
        "1: \n\t"                                               \
                                                                \
        /* next source row -> wr5, store AVG(wr4, wr5) */       \
        "wldrd wr2, [%[pixels]] \n\t"                           \
        "wldrd wr3, [%[pixels], #8] \n\t"                       \
        "add %[pixels], %[pixels], %[line_size] \n\t"           \
        "pld [%[pixels]] \n\t"                                  \
        "walignr1 wr5, wr2, wr3 \n\t"                           \
        AVG " wr6, wr4, wr5 \n\t"                               \
        "wstrd wr6, [%[block]] \n\t"                            \
        "add %[block], %[block], %[line_size] \n\t"             \
                                                                \
        /* next source row -> wr4, store AVG(wr4, wr5) */       \
        "wldrd wr0, [%[pixels]] \n\t"                           \
        "wldrd wr1, [%[pixels], #8] \n\t"                       \
        "add %[pixels], %[pixels], %[line_size] \n\t"           \
        "walignr1 wr4, wr0, wr1 \n\t"                           \
        "pld [%[pixels]] \n\t"                                  \
        AVG " wr6, wr4, wr5 \n\t"                               \
        "wstrd wr6, [%[block]] \n\t"                            \
        "add %[block], %[block], %[line_size] \n\t"             \
                                                                \
        "subs %[h], %[h], #2 \n\t"                              \
        "bne 1b \n\t"                                           \
        : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h)  \
        : [line_size]"r"(line_size)                             \
        : "cc", "memory", "r12")

/**
 * Vertical half-pel copy of an 8-pixel-wide column using "wavg2br".
 *
 * @param block     destination, advanced by line_size per row
 * @param pixels    source (any alignment), advanced by line_size per row;
 *                  h + 1 source rows are read
 * @param line_size distance in bytes between rows, used for both buffers
 * @param h         number of rows to write; must be a positive even number
 */
void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    OP("wavg2br");
}

/**
 * Same as put_pixels8_y2_iwmmxt() but averaging with "wavg2b".
 *
 * @param block     destination, advanced by line_size per row
 * @param pixels    source (any alignment), advanced by line_size per row;
 *                  h + 1 source rows are read
 * @param line_size distance in bytes between rows, used for both buffers
 * @param h         number of rows to write; must be a positive even number
 */
void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    OP("wavg2b");
}
#undef OP
85 | |
86 void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size) | |
87 { | |
88 uint8_t *pixels2 = pixels + line_size; | |
89 | |
90 __asm__ __volatile__ ( | |
91 "mov r12, #4 \n\t" | |
92 "1: \n\t" | |
93 "pld [%[pixels], %[line_size2]] \n\t" | |
94 "pld [%[pixels2], %[line_size2]] \n\t" | |
95 "wldrd wr4, [%[pixels]] \n\t" | |
96 "wldrd wr5, [%[pixels2]] \n\t" | |
97 "pld [%[block], #32] \n\t" | |
98 "wunpckelub wr6, wr4 \n\t" | |
99 "wldrd wr0, [%[block]] \n\t" | |
100 "wunpckehub wr7, wr4 \n\t" | |
101 "wldrd wr1, [%[block], #8] \n\t" | |
102 "wunpckelub wr8, wr5 \n\t" | |
103 "wldrd wr2, [%[block], #16] \n\t" | |
104 "wunpckehub wr9, wr5 \n\t" | |
105 "wldrd wr3, [%[block], #24] \n\t" | |
106 "add %[block], %[block], #32 \n\t" | |
107 "waddhss wr10, wr0, wr6 \n\t" | |
108 "waddhss wr11, wr1, wr7 \n\t" | |
109 "waddhss wr12, wr2, wr8 \n\t" | |
110 "waddhss wr13, wr3, wr9 \n\t" | |
111 "wpackhus wr14, wr10, wr11 \n\t" | |
112 "wpackhus wr15, wr12, wr13 \n\t" | |
113 "wstrd wr14, [%[pixels]] \n\t" | |
114 "add %[pixels], %[pixels], %[line_size2] \n\t" | |
115 "subs r12, r12, #1 \n\t" | |
116 "wstrd wr15, [%[pixels2]] \n\t" | |
117 "add %[pixels2], %[pixels2], %[line_size2] \n\t" | |
118 "bne 1b \n\t" | |
119 : [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2) | |
120 : [line_size2]"r"(line_size << 1) | |
121 : "cc", "memory", "r12"); | |
122 } | |
123 | |
/*
 * Placeholder with the same parameter list as the pixel functions in this
 * file; it touches none of its arguments.  It is not referenced anywhere in
 * the visible part of this file.
 */
static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    (void)block;
    (void)pixels;
    (void)line_size;
    (void)h;
}
128 | |
129 void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx) | |
130 { | |
131 c->add_pixels_clamped = add_pixels_clamped_iwmmxt; | |
132 | |
133 c->put_pixels_tab[0][0] = put_pixels16_iwmmxt; | |
134 c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt; | |
135 c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt; | |
136 c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt; | |
137 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt; | |
138 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt; | |
139 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt; | |
140 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt; | |
141 | |
142 c->put_pixels_tab[1][0] = put_pixels8_iwmmxt; | |
143 c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt; | |
144 c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt; | |
145 c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt; | |
146 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt; | |
147 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt; | |
148 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt; | |
149 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt; | |
150 | |
151 c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt; | |
152 c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt; | |
153 c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt; | |
154 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt; | |
155 c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt; | |
156 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt; | |
157 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt; | |
158 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt; | |
159 | |
160 c->avg_pixels_tab[1][0] = avg_pixels8_iwmmxt; | |
161 c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt; | |
162 c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt; | |
163 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt; | |
164 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt; | |
165 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt; | |
166 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt; | |
167 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt; | |
168 } |