annotate bfin/vp3_idct_bfin.S @ 8404:60b6a780100b libavcodec

Port x264 deblocking code to libavcodec. This includes SSE2 luma deblocking code and both MMXEXT and SSE2 luma intra deblocking code for H.264 decoding. This assembly is available under --enable-gpl and speeds decoding of Cathedral by 7%.
author darkshikari
date Fri, 19 Dec 2008 13:45:13 +0000
parents 78aa57eba353
children 8327c5b4df9b
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5776
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
1 /*
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
2 * vp3_idct BlackFin
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
3 *
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
4 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
5 *
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
6 * This file is part of FFmpeg.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
7 *
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
8 * FFmpeg is free software; you can redistribute it and/or
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
9 * modify it under the terms of the GNU Lesser General Public
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
10 * License as published by the Free Software Foundation; either
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
11 * version 2.1 of the License, or (at your option) any later version.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
12 *
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
13 * FFmpeg is distributed in the hope that it will be useful,
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
16 * Lesser General Public License for more details.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
17 *
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
18 * You should have received a copy of the GNU Lesser General Public
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
19 * License along with FFmpeg; if not, write to the Free Software
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
21 */
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
22 /*
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
23 This blackfin DSP code implements an 8x8 inverse type II DCT.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
24
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
25 Prototype : void ff_bfin_vp3_idct(DCTELEM *in)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
26
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
27 Registers Used : A0, A1, R0-R7, I0-I3, B0, B2, B3, M0-M3, L0-L3, P0-P5, LC0.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
28
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
29 */
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
30
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
31 #include "config_bfin.h"
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
32
6362
78aa57eba353 FLAT objects cannot have multiple sections, so using the L1 attributes breaks
diego
parents: 5776
diff changeset
33 #ifdef __FDPIC__
5776
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
34 .section .l1.data.B,"aw",@progbits
6362
78aa57eba353 FLAT objects cannot have multiple sections, so using the L1 attributes breaks
diego
parents: 5776
diff changeset
35 #else
78aa57eba353 FLAT objects cannot have multiple sections, so using the L1 attributes breaks
diego
parents: 5776
diff changeset
36 .data
78aa57eba353 FLAT objects cannot have multiple sections, so using the L1 attributes breaks
diego
parents: 5776
diff changeset
37 #endif
5776
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
38
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
39 .align 4;
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
40 coefs:
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
41 .short 0x5a82; // C4
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
42 .short 0x5a82; // C4
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
43 .short 0x30FC; //cos(3pi/8) C6
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
44 .short 0x7642; //cos(pi/8) C2
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
45 .short 0x18F9; //cos(7pi/16)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
46 .short 0x7D8A; //cos(pi/16)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
47 .short 0x471D; //cos(5pi/16)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
48 .short 0x6A6E; //cos(3pi/16)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
49 .short 0x18F9; //cos(7pi/16)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
50 .short 0x7D8A; //cos(pi/16)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
51
6362
78aa57eba353 FLAT objects cannot have multiple sections, so using the L1 attributes breaks
diego
parents: 5776
diff changeset
52 #ifdef __FDPIC__
5776
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
53 .section .l1.data.A
6362
78aa57eba353 FLAT objects cannot have multiple sections, so using the L1 attributes breaks
diego
parents: 5776
diff changeset
54 #endif
5776
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
55
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
56 vtmp: .space 256
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
57
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
58 #define TMP0 FP-8
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
59 #define TMP1 FP-12
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
60 #define TMP2 FP-16
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
61
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
62
6362
78aa57eba353 FLAT objects cannot have multiple sections, so using the L1 attributes breaks
diego
parents: 5776
diff changeset
63 .text
5776
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
64 DEFUN(vp3_idct,mL1,
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
65 (DCTELEM *block)):
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
66
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
67 /********************** Function Prologue *********************************/
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
68 link 16;
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
69 [--SP] = (R7:4, P5:3); // Push the registers onto the stack.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
70 B0 = R0; // Pointer to Input matrix
6362
78aa57eba353 FLAT objects cannot have multiple sections, so using the L1 attributes breaks
diego
parents: 5776
diff changeset
71 RELOC(R1, P3, coefs); // Pointer to Coefficients
78aa57eba353 FLAT objects cannot have multiple sections, so using the L1 attributes breaks
diego
parents: 5776
diff changeset
72 RELOC(R2, P3, vtmp); // Pointer to Temporary matrix
5776
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
73 B3 = R1;
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
74 B2 = R2;
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
75 L3 = 20; // L3 is used for making the coefficient array
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
76 // circular.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
77 // MUST BE RESTORED TO ZERO at function exit.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
78 M1 = 16 (X); // All these registers are initialized for
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
79 M3 = 8(X); // modifying address offsets.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
80
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
81 I0 = B0; // I0 points to Input Element (0, 0).
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
82 I2 = B0; // I2 points to Input Element (0, 0).
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
83 I2 += M3 || R0.H = W[I0];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
84 // Element 0 is read into R0.H
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
85 I1 = I2; // I1 points to input Element (0, 6).
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
86 I1 += 4 || R0.L = W[I2++];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
87 // I2 points to input Element (0, 4).
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
88 // Element 4 is read into R0.L.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
89 P2 = 8 (X);
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
90 P3 = 32 (X);
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
91 P4 = -32 (X);
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
92 P5 = 98 (X);
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
93 R7 = 0x8000(Z);
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
94 I3 = B3; // I3 points to Coefficients
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
95 P0 = B2; // P0 points to array Element (0, 0) of temp
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
96 P1 = B2;
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
97 R7 = [I3++] || [TMP2]=R7; // Coefficient C4 is read into R7.H and R7.L.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
98 MNOP;
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
99 NOP;
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
100
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
101 /*
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
102 * A1 = Y0 * cos(pi/4)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
103 * A0 = Y0 * cos(pi/4)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
104 * A1 = A1 + Y4 * cos(pi/4)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
105 * A0 = A0 - Y4 * cos(pi/4)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
106 * load:
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
107 * R1=(Y2,Y6)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
108 * R7=(C2,C6)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
109 * res:
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
110 * R3=Y0, R2=Y4
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
111 */
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
112 A1=R7.H*R0.H, A0=R7.H*R0.H (IS) || I0+= 4 || R1.L=W[I1++];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
113 R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || R1.H=W[I0--] || R7=[I3++];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
114
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
115 LSETUP (.0, .1) LC0 = P2; // perform 8 1d idcts
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
116
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
117 P2 = 112 (X);
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
118 P1 = P1 + P2; // P1 points to element (7, 0) of temp buffer.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
119 P2 = -94(X);
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
120
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
121 .0:
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
122 /*
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
123 * A1 = Y2 * cos(3pi/8)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
124 * A0 = Y2 * cos(pi/8)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
125 * A1 = A1 - Y6 * cos(pi/8)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
126 * A0 = A0 + Y6 * cos(3pi/8)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
127 * R5 = (Y1,Y7)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
128 * R7 = (C1,C7)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
129 * res:
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
130 * R1=Y2, R0=Y6
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
131 */
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
132 A1=R7.L*R1.H, A0=R7.H*R1.H (IS) || I0+=4 || R5.H=W[I0];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
133 R1=(A1-=R7.H*R1.L), R0=(A0+=R7.L*R1.L) (IS) || R5.L=W[I1--] || R7=[I3++];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
134 /*
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
135 * Y0 = Y0 + Y6.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
136 * Y4 = Y4 + Y2.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
137 * Y2 = Y4 - Y2.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
138 * Y6 = Y0 - Y6.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
139 * R3 is saved
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
140 * R6.l=Y3
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
141 * note: R3: Y0, R2: Y4, R1: Y2, R0: Y6
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
142 */
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
143 R3=R3+R0, R0=R3-R0;
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
144 R2=R2+R1, R1=R2-R1 || [TMP0]=R3 || R6.L=W[I0--];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
145 /*
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
146 * Compute the odd portion (1,3,5,7) even is done.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
147 *
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
148 * Y1 = C7 * Y1 - C1 * Y7 + C3 * Y5 - C5 * Y3.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
149 * Y7 = C1 * Y1 + C7 * Y7 + C5 * Y5 + C3 * Y3.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
150 * Y5 = C5 * Y1 + C3 * Y7 + C7 * Y5 - C1 * Y3.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
151 * Y3 = C3 * Y1 - C5 * Y7 - C1 * Y5 - C7 * Y3.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
152 */
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
153 // R5=(Y1,Y7) R6=(Y5,Y3) // R7=(C1,C7)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
154 A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || [TMP1]=R2 || R6.H=W[I2--];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
155 A1-=R7.H*R5.L, A0+=R7.L*R5.L (IS) || I0-=4 || R7=[I3++];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
156 A1+=R7.H*R6.H, A0+=R7.L*R6.H (IS) || I0+=M1; // R7=(C3,C5)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
157 R3 =(A1-=R7.L*R6.L), R2 =(A0+=R7.H*R6.L) (IS);
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
158 A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || R4=[TMP0];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
159 A1+=R7.H*R5.L, A0-=R7.L*R5.L (IS) || I1+=M1 || R7=[I3++]; // R7=(C1,C7)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
160 A1+=R7.L*R6.H, A0-=R7.H*R6.H (IS);
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
161 R7 =(A1-=R7.H*R6.L), R6 =(A0-=R7.L*R6.L) (IS) || I2+=M1;
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
162 // R3=Y1, R2=Y7, R7=Y5, R6=Y3
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
163
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
164 /* Transpose write column. */
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
165 R5.H=R4+R2 (RND12); // Y0=Y0+Y7
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
166 R5.L=R4-R2 (RND12) || R4 = [TMP1]; // Y7=Y0-Y7
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
167 R2.H=R1+R7 (RND12) || W[P0++P3]=R5.H; // Y2=Y2+Y5 st Y0
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
168 R2.L=R1-R7 (RND12) || W[P1++P4]=R5.L || R7=[I3++]; // Y5=Y2-Y5 st Y7
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
169 R5.H=R0-R3 (RND12) || W[P0++P3]=R2.H || R1.L=W[I1++]; // Y1=Y6-Y1 st Y2
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
170 R5.L=R0+R3 (RND12) || W[P1++P4]=R2.L || R0.H=W[I0++]; // Y6=Y6+Y1 st Y5
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
171 R3.H=R4-R6 (RND12) || W[P0++P3]=R5.H || R0.L=W[I2++]; // Y3=Y4-Y3 st Y1
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
172 R3.L=R4+R6 (RND12) || W[P1++P4]=R5.L || R1.H=W[I0++]; // Y4=Y3+Y4 st Y6
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
173
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
174 /* pipeline loop start, + drain Y3, Y4 */
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
175 A1=R7.H*R0.H, A0=R7.H*R0.H (IS) || W[P0++P2]= R3.H || R1.H = W[I0--];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
176 .1: R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || W[P1++P5]= R3.L || R7 = [I3++];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
177
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
178
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
179
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
180 I0 = B2; // I0 points to Input Element (0, 0)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
181 I2 = B2; // I2 points to Input Element (0, 0)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
182 I2 += M3 || R0.H = W[I0];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
183 // Y0 is read in R0.H
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
184 I1 = I2; // I1 points to input Element (0, 6)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
185 I1 += 4 || R0.L = W[I2++];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
186 // I2 points to input Element (0, 4)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
187 // Y4 is read in R0.L
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
188 P2 = 8 (X);
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
189 I3 = B3; // I3 points to Coefficients
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
190 P0 = B0; // P0 points to array Element (0, 0) for writing
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
191 // output
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
192 P1 = B0;
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
193 R7 = [I3++]; // R7.H = C4 and R7.L = C4
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
194 NOP;
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
195
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
196 /*
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
197 * A1 = Y0 * cos(pi/4)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
198 * A0 = Y0 * cos(pi/4)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
199 * A1 = A1 + Y4 * cos(pi/4)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
200 * A0 = A0 - Y4 * cos(pi/4)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
201 * load:
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
202 * R1=(Y2,Y6)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
203 * R7=(C2,C6)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
204 * res:
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
205 * R3=Y0, R2=Y4
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
206 */
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
207 A1=R7.H*R0.H, A0=R7.H*R0.H (IS) || I0+=4 || R1.L=W[I1++];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
208 R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || R1.H=W[I0--] || R7=[I3++];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
209
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
210 LSETUP (.2, .3) LC0 = P2; // perform 8 1d idcts
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
211 P2 = 112 (X);
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
212 P1 = P1 + P2;
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
213 P2 = -94(X);
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
214
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
215 .2:
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
216 /*
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
217 * A1 = Y2 * cos(3pi/8)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
218 * A0 = Y2 * cos(pi/8)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
219 * A1 = A1 - Y6 * cos(pi/8)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
220 * A0 = A0 + Y6 * cos(3pi/8)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
221 * R5 = (Y1,Y7)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
222 * R7 = (C1,C7)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
223 * res:
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
224 * R1=Y2, R0=Y6
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
225 */
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
226 A1=R7.L*R1.H, A0=R7.H*R1.H (IS) || I0+=4 || R5.H=W[I0];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
227 R1=(A1-=R7.H*R1.L), R0=(A0+=R7.L*R1.L) (IS) || R5.L=W[I1--] || R7=[I3++];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
228 /*
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
229 * Y0 = Y0 + Y6.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
230 * Y4 = Y4 + Y2.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
231 * Y2 = Y4 - Y2.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
232 * Y6 = Y0 - Y6.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
233 * R3 is saved
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
234 * R6.l=Y3
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
235 * note: R3: Y0, R2: Y4, R1: Y2, R0: Y6
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
236 */
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
237 R3=R3+R0, R0=R3-R0;
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
238 R2=R2+R1, R1=R2-R1 || [TMP0]=R3 || R6.L=W[I0--];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
239 /*
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
240 * Compute the odd portion (1,3,5,7); the even portion is done.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
241 *
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
242 * Y1 = C7 * Y1 - C1 * Y7 + C3 * Y5 - C5 * Y3.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
243 * Y7 = C1 * Y1 + C7 * Y7 + C5 * Y5 + C3 * Y3.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
244 * Y5 = C5 * Y1 + C3 * Y7 + C7 * Y5 - C1 * Y3.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
245 * Y3 = C3 * Y1 - C5 * Y7 - C1 * Y5 - C7 * Y3.
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
246 */
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
247 // R5=(Y1,Y7) R6=(Y5,Y3) // R7=(C1,C7)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
248 A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || [TMP1]=R2 || R6.H=W[I2--];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
249 A1-=R7.H*R5.L, A0+=R7.L*R5.L (IS) || I0-=4 || R7=[I3++];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
250 A1+=R7.H*R6.H, A0+=R7.L*R6.H (IS) || I0+=M1; // R7=(C3,C5)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
251 R3 =(A1-=R7.L*R6.L), R2 =(A0+=R7.H*R6.L) (IS);
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
252 A1 =R7.L*R5.H, A0 =R7.H*R5.H (IS) || R4=[TMP0];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
253 A1+=R7.H*R5.L, A0-=R7.L*R5.L (IS) || I1+=M1 || R7=[I3++]; // R7=(C1,C7)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
254 A1+=R7.L*R6.H, A0-=R7.H*R6.H (IS);
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
255 R7 =(A1-=R7.H*R6.L), R6 =(A0-=R7.L*R6.L) (IS) || I2+=M1;
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
256 // R3=Y1, R2=Y7, R7=Y5, R6=Y3
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
257
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
258 /* Transpose write column. */
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
259 R5.H=R4+R2 (RND20); // Y0=Y0+Y7
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
260 R5.L=R4-R2 (RND20) || R4 = [TMP1]; // Y7=Y0-Y7
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
261 R5=R5>>>2(v);
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
262 R2.H=R1+R7 (RND20) || W[P0++P3]=R5.H; // Y2=Y2+Y5 st Y0
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
263 R2.L=R1-R7 (RND20) || W[P1++P4]=R5.L || R7=[I3++]; // Y5=Y2-Y5 st Y7
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
264 R2=R2>>>2(v);
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
265 R5.H=R0-R3 (RND20) || W[P0++P3]=R2.H || R1.L=W[I1++]; // Y1=Y6-Y1 st Y2
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
266 R5.L=R0+R3 (RND20) || W[P1++P4]=R2.L || R0.H=W[I0++]; // Y6=Y6+Y1 st Y5
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
267 R5=R5>>>2(v);
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
268 R3.H=R4-R6 (RND20) || W[P0++P3]=R5.H || R0.L=W[I2++]; // Y3=Y4-Y3 st Y1
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
269 R3.L=R4+R6 (RND20) || W[P1++P4]=R5.L || R1.H=W[I0++]; // Y4=Y3+Y4 st Y6
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
270 R3=R3>>>2(v);
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
271 /* pipeline loop start, + drain Y3, Y4 */
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
272 A1=R7.H*R0.H, A0=R7.H*R0.H (IS) || W[P0++P2]= R3.H || R1.H = W[I0--];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
273 .3: R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || W[P1++P5]= R3.L || R7 = [I3++];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
274
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
275 L3 = 0;
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
276 (R7:4,P5:3)=[SP++];
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
277 unlink;
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
278 RTS;
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
279 DEFUN_END(vp3_idct)
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
280
73ec16dbcbc1 blackfin optimized vp3 transform and infastructure for idct
mhoffman
parents:
diff changeset
281