Mercurial > libavcodec.hg
comparison arm/mpegvideo_armv5te_s.S @ 8359:9281a8a9387a libavcodec
ARM: replace "armv4l" with "arm"
author | mru |
---|---|
date | Wed, 17 Dec 2008 00:54:54 +0000 |
parents | armv4l/mpegvideo_armv5te_s.S@06acc3ab4bdc |
children | 361a5fcb4393 |
comparison
equal
deleted
inserted
replaced
8358:c30b92cf446b | 8359:9281a8a9387a |
---|---|
1 /* | |
2 * Optimization of some functions from mpegvideo.c for armv5te | |
3 * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net> | |
4 * | |
5 * This file is part of FFmpeg. | |
6 * | |
7 * FFmpeg is free software; you can redistribute it and/or | |
8 * modify it under the terms of the GNU Lesser General Public | |
9 * License as published by the Free Software Foundation; either | |
10 * version 2.1 of the License, or (at your option) any later version. | |
11 * | |
12 * FFmpeg is distributed in the hope that it will be useful, | |
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
18 * License along with FFmpeg; if not, write to the Free Software | |
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 */ | |
21 | |
22 #include "config.h" | |
23 #include "asm.S" | |
24 | |
25 /* | |
26 * Special optimized version of dct_unquantize_h263_helper_c. It | |
27 * requires the block to be at least 8-byte aligned, and may process | |
28 * more elements than requested. But it is guaranteed to never | |
29 * process more than 64 elements provided that the count argument is <= 64, | |
30 * so it is safe. This function is optimized for a common distribution | |
31 * of values for nCoeffs (they are mostly a multiple of 8 plus one or | |
32 * two extra elements). So this function processes 8 elements | |
33 * per loop iteration and optionally processes 2 more elements at | |
34 * the end. | |
35 * | |
36 * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770) | |
37 */ | |
38 function ff_dct_unquantize_h263_armv5te, export=1 /* registers as read below: r0 = pointer to 16-bit elements (rewritten in place), r1 = multiplier (only its bottom halfword is used), r2 = offset applied with each element's sign, r3 = element count */ | |
39 push {r4-r9,lr} /* save the scratch registers used below and the return address */ | |
40 mov ip, #0 /* ip = 0: constant subtrahend for the rsbs sign tests */ | |
41 subs r3, r3, #2 /* bias count by -2 so the loop exits with <= 2 elements pending */ | |
42 ble 2f /* count <= 2: skip the 8-element loop, run only the 2-element tail */ | |
43 ldrd r4, [r0, #0] /* r4,r5 = first four halfwords (two per register) */ | |
44 1: /* main loop: 8 elements per iteration */ | |
45 ldrd r6, [r0, #8] /* r6,r7 = the following four halfwords */ | |
46 | |
47 rsbs r9, ip, r4, asr #16 /* r9 = top halfword of r4, sign-extended; flags reflect its sign */ | |
48 addgt r9, r2, #0 /* > 0: r9 = +r2 */ | |
49 rsblt r9, r2, #0 /* < 0: r9 = -r2 */ | |
50 smlatbne r9, r4, r1, r9 /* != 0: r9 = top(r4) * bottom(r1) + r9; a zero element stays 0 */ | |
51 | |
52 rsbs lr, ip, r5, asr #16 /* same steps for the top halfword of r5, result in lr */ | |
53 addgt lr, r2, #0 | |
54 rsblt lr, r2, #0 | |
55 smlatbne lr, r5, r1, lr | |
56 | |
57 rsbs r8, ip, r4, asl #16 /* r8 = r4 << 16: tests sign/zero of the bottom halfword of r4 */ | |
58 addgt r8, r2, #0 /* > 0: r8 = +r2 */ | |
59 rsblt r8, r2, #0 /* < 0: r8 = -r2 */ | |
60 smlabbne r4, r4, r1, r8 /* != 0: r4 = bottom(r4) * bottom(r1) + r8; if 0, r4's bottom halfword is already 0 */ | |
61 | |
62 rsbs r8, ip, r5, asl #16 /* same steps for the bottom halfword of r5, result in r5 */ | |
63 addgt r8, r2, #0 | |
64 rsblt r8, r2, #0 | |
65 smlabbne r5, r5, r1, r8 | |
66 | |
67 strh r4, [r0], #2 /* store order: bottom(r4), top(r4), bottom(r5), top(r5); r0 += 2 after each store. NOTE(review): this halfword-to-address pairing presumes little-endian data - confirm */ | |
68 strh r9, [r0], #2 | |
69 strh r5, [r0], #2 | |
70 strh lr, [r0], #2 | |
71 | |
72 rsbs r9, ip, r6, asr #16 /* second group of four: top halfword of r6, result in r9 */ | |
73 addgt r9, r2, #0 | |
74 rsblt r9, r2, #0 | |
75 smlatbne r9, r6, r1, r9 | |
76 | |
77 rsbs lr, ip, r7, asr #16 /* top halfword of r7, result in lr */ | |
78 addgt lr, r2, #0 | |
79 rsblt lr, r2, #0 | |
80 smlatbne lr, r7, r1, lr | |
81 | |
82 rsbs r8, ip, r6, asl #16 /* bottom halfword of r6, result in r6 */ | |
83 addgt r8, r2, #0 | |
84 rsblt r8, r2, #0 | |
85 smlabbne r6, r6, r1, r8 | |
86 | |
87 rsbs r8, ip, r7, asl #16 /* bottom halfword of r7, result in r7 */ | |
88 addgt r8, r2, #0 | |
89 rsblt r8, r2, #0 | |
90 smlabbne r7, r7, r1, r8 | |
91 | |
92 strh r6, [r0], #2 /* store order: bottom(r6), top(r6), bottom(r7), top(r7); r0 has advanced 16 bytes per iteration after these */ | |
93 strh r9, [r0], #2 | |
94 strh r7, [r0], #2 | |
95 strh lr, [r0], #2 | |
96 | |
97 subs r3, r3, #8 /* account for the 8 elements just processed */ | |
98 ldrgtd r4, [r0, #0] /* load data early to avoid load/use pipeline stall */ | |
99 bgt 1b /* biased count still > 0 (more than 2 elements pending): iterate again */ | |
100 | |
101 adds r3, r3, #2 /* undo the -2 bias: r3 = elements still pending (<= 2 here) */ | |
102 pople {r4-r9,pc} /* nothing pending: restore registers and return */ | |
103 2: /* tail: always processes 2 elements, even if fewer were requested */ | |
104 ldrsh r9, [r0, #0] /* r9 = first tail element, sign-extended */ | |
105 ldrsh lr, [r0, #2] /* lr = second tail element, sign-extended */ | |
106 mov r8, r2 /* r8 = +r2, kept when r9 > 0 */ | |
107 cmp r9, #0 | |
108 rsblt r8, r2, #0 /* r9 < 0: r8 = -r2 */ | |
109 smlabbne r9, r9, r1, r8 /* r9 != 0: r9 = bottom(r9) * bottom(r1) + r8; zero is left as 0 */ | |
110 mov r8, r2 /* same steps for the second tail element in lr */ | |
111 cmp lr, #0 | |
112 rsblt r8, r2, #0 | |
113 smlabbne lr, lr, r1, r8 | |
114 strh r9, [r0], #2 /* write both tail results back; r0 += 2 after each store */ | |
115 strh lr, [r0], #2 | |
116 pop {r4-r9,pc} /* restore registers and return */ | |
117 .endfunc | |