annotate i386/snowdsp_mmx.c @ 3207:33110c1008a4 libavcodec

Add the mmx and sse2 implementations of ff_snow_vertical_compose(). Patch by Robert Edele < yartrebo AH earthlink POIS net > Original thread: Date: Mar 20, 2006 5:54 PM Subject: [Ffmpeg-devel] [PATCH] snow mmx + sse2 part 3
author gpoirier
date Mon, 20 Mar 2006 22:27:59 +0000
parents
children 81cafbc23b8d
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3207
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
1 /*
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
2 * MMX and SSE2 optimized snow DSP utils
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
3 * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
4 *
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
5 * This library is free software; you can redistribute it and/or
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
6 * modify it under the terms of the GNU Lesser General Public
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
7 * License as published by the Free Software Foundation; either
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
8 * version 2 of the License, or (at your option) any later version.
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
9 *
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
10 * This library is distributed in the hope that it will be useful,
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
13 * Lesser General Public License for more details.
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
14 *
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
15 * You should have received a copy of the GNU Lesser General Public
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
16 * License along with this library; if not, write to the Free Software
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
18 */
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
19
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
20 #include "../avcodec.h"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
21 #include "../snow.h"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
22 #include "mmx.h"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
23
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
/*
 * Building blocks for the SSE2 inline-asm kernel below.  Every macro expands
 * to four instructions (one per register of a four-register group) and is
 * meant to be pasted into an asm template string.  Register and opcode
 * arguments are string literals without the leading '%'.
 */

/*
 * Apply `insn` with a memory source to each of x0..x3.  The sources are the
 * four consecutive 16-byte slots starting at base + REG_d*4.
 * NOTE(review): REG_d comes from mmx.h; presumably the name of the register
 * bound to the "d" constraint by the caller -- confirm.
 */
#define snow_vertical_compose_sse2_load_add(insn,base,x0,x1,x2,x3)\
        ""insn" (%%"base",%%"REG_d",4), %%"x0" \n\t"\
        ""insn" 16(%%"base",%%"REG_d",4), %%"x1" \n\t"\
        ""insn" 32(%%"base",%%"REG_d",4), %%"x2" \n\t"\
        ""insn" 48(%%"base",%%"REG_d",4), %%"x3" \n\t"

/* x0..x3 = the four 16-byte slots at base + REG_d*4 (movdqa). */
#define snow_vertical_compose_sse2_load(base,x0,x1,x2,x3)\
        snow_vertical_compose_sse2_load_add("movdqa",base,x0,x1,x2,x3)

/* x0..x3 += the four 16-byte slots at base + REG_d*4 (paddd, dword lanes). */
#define snow_vertical_compose_sse2_add(base,x0,x1,x2,x3)\
        snow_vertical_compose_sse2_load_add("paddd",base,x0,x1,x2,x3)

/* Register-to-register subtract: dstN -= srcN for N = 0..3 (psubd). */
#define snow_vertical_compose_sse2_sub(src0,src1,src2,src3,dst0,dst1,dst2,dst3)\
        "psubd %%"src0", %%"dst0" \n\t"\
        "psubd %%"src1", %%"dst1" \n\t"\
        "psubd %%"src2", %%"dst2" \n\t"\
        "psubd %%"src3", %%"dst3" \n\t"

/* Store src0..src3 to the four 16-byte slots at base + REG_d*4 (movdqa). */
#define snow_vertical_compose_sse2_store(base,src0,src1,src2,src3)\
        "movdqa %%"src0", (%%"base",%%"REG_d",4) \n\t"\
        "movdqa %%"src1", 16(%%"base",%%"REG_d",4) \n\t"\
        "movdqa %%"src2", 32(%%"base",%%"REG_d",4) \n\t"\
        "movdqa %%"src3", 48(%%"base",%%"REG_d",4) \n\t"

/* Arithmetic right shift of every dword lane of x0..x3 by `bits` (psrad). */
#define snow_vertical_compose_sse2_sra(bits,x0,x1,x2,x3)\
        "psrad $"bits", %%"x0" \n\t"\
        "psrad $"bits", %%"x1" \n\t"\
        "psrad $"bits", %%"x2" \n\t"\
        "psrad $"bits", %%"x3" \n\t"

/* Register-to-register add: dstN += srcN for N = 0..3 (paddd). */
#define snow_vertical_compose_sse2_r2r_add(src0,src1,src2,src3,dst0,dst1,dst2,dst3)\
        "paddd %%"src0", %%"dst0" \n\t"\
        "paddd %%"src1", %%"dst1" \n\t"\
        "paddd %%"src2", %%"dst2" \n\t"\
        "paddd %%"src3", %%"dst3" \n\t"

/* Left shift of every dword lane of x0..x3 by `bits` (pslld). */
#define snow_vertical_compose_sse2_sll(bits,x0,x1,x2,x3)\
        "pslld $"bits", %%"x0" \n\t"\
        "pslld $"bits", %%"x1" \n\t"\
        "pslld $"bits", %%"x2" \n\t"\
        "pslld $"bits", %%"x3" \n\t"

/* Register-to-register copy: dstN = srcN for N = 0..3 (movdqa). */
#define snow_vertical_compose_sse2_move(src0,src1,src2,src3,dst0,dst1,dst2,dst3)\
        "movdqa %%"src0", %%"dst0" \n\t"\
        "movdqa %%"src1", %%"dst1" \n\t"\
        "movdqa %%"src2", %%"dst2" \n\t"\
        "movdqa %%"src3", %%"dst3" \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
71
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
72 void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
73 long i = width;
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
74
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
75 while(i & 0xF)
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
76 {
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
77 i--;
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
78 b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
79 b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
80 b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
81 b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
82 }
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
83
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
84 asm volatile (
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
85 "jmp 2f \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
86 "1: \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
87
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
88 "mov %6, %%"REG_a" \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
89 "mov %4, %%"REG_b" \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
90
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
91 snow_vertical_compose_sse2_load(REG_b,"xmm0","xmm2","xmm4","xmm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
92 snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
93 snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
94 snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")\
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
95 snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
96
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
97 "pcmpeqd %%xmm1, %%xmm1 \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
98 "pslld $31, %%xmm1 \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
99 "psrld $29, %%xmm1 \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
100 "mov %5, %%"REG_a" \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
101
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
102 snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
103 snow_vertical_compose_sse2_sra("3","xmm0","xmm2","xmm4","xmm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
104 snow_vertical_compose_sse2_load(REG_a,"xmm1","xmm3","xmm5","xmm7")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
105 snow_vertical_compose_sse2_sub("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
106 snow_vertical_compose_sse2_store(REG_a,"xmm1","xmm3","xmm5","xmm7")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
107 "mov %3, %%"REG_c" \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
108 snow_vertical_compose_sse2_load(REG_b,"xmm0","xmm2","xmm4","xmm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
109 snow_vertical_compose_sse2_add(REG_c,"xmm1","xmm3","xmm5","xmm7")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
110 snow_vertical_compose_sse2_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
111 snow_vertical_compose_sse2_store(REG_b,"xmm0","xmm2","xmm4","xmm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
112 "mov %2, %%"REG_a" \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
113 snow_vertical_compose_sse2_load(REG_c,"xmm1","xmm3","xmm5","xmm7")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
114 snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
115 snow_vertical_compose_sse2_sll("2","xmm1","xmm3","xmm5","xmm7")\
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
116 snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
117
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
118 "pcmpeqd %%xmm1, %%xmm1 \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
119 "pslld $31, %%xmm1 \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
120 "psrld $28, %%xmm1 \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
121 "mov %1, %%"REG_b" \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
122
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
123 snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
124 snow_vertical_compose_sse2_sra("4","xmm0","xmm2","xmm4","xmm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
125 snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
126 snow_vertical_compose_sse2_store(REG_c,"xmm0","xmm2","xmm4","xmm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
127 snow_vertical_compose_sse2_add(REG_b,"xmm0","xmm2","xmm4","xmm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
128 snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
129 snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")\
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
130 snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
131 snow_vertical_compose_sse2_sra("1","xmm0","xmm2","xmm4","xmm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
132 snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
133 snow_vertical_compose_sse2_store(REG_a,"xmm0","xmm2","xmm4","xmm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
134
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
135 "2: \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
136 "sub $16, %%"REG_d" \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
137 "jge 1b \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
138 :"+d"(i)
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
139 :
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
140 "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5):
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
141 "%"REG_a"","%"REG_b"","%"REG_c"");
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
142 }
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
143
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
/*
 * MMX counterparts of the SSE2 building blocks.  Each macro expands to four
 * instructions for pasting into an asm template string.  Memory forms step
 * through four consecutive 8-byte slots; register-only forms reuse the SSE2
 * macros, which emit the same mnemonics for whatever register names they
 * are given.
 */

/*
 * Apply `insn` with a memory source to each of m0..m3.  The sources are the
 * four consecutive 8-byte slots starting at base + REG_d*4.
 */
#define snow_vertical_compose_mmx_load_add(insn,base,m0,m1,m2,m3)\
        ""insn" (%%"base",%%"REG_d",4), %%"m0" \n\t"\
        ""insn" 8(%%"base",%%"REG_d",4), %%"m1" \n\t"\
        ""insn" 16(%%"base",%%"REG_d",4), %%"m2" \n\t"\
        ""insn" 24(%%"base",%%"REG_d",4), %%"m3" \n\t"

/* m0..m3 = the four 8-byte slots at base + REG_d*4 (movq). */
#define snow_vertical_compose_mmx_load(base,m0,m1,m2,m3)\
        snow_vertical_compose_mmx_load_add("movq",base,m0,m1,m2,m3)

/* m0..m3 += the four 8-byte slots at base + REG_d*4 (paddd, dword lanes). */
#define snow_vertical_compose_mmx_add(base,m0,m1,m2,m3)\
        snow_vertical_compose_mmx_load_add("paddd",base,m0,m1,m2,m3)

/* Register-to-register subtract: dstN -= srcN for N = 0..3. */
#define snow_vertical_compose_mmx_sub(src0,src1,src2,src3,dst0,dst1,dst2,dst3)\
        snow_vertical_compose_sse2_sub(src0,src1,src2,src3,dst0,dst1,dst2,dst3)

/* Store src0..src3 to the four 8-byte slots at base + REG_d*4 (movq). */
#define snow_vertical_compose_mmx_store(base,src0,src1,src2,src3)\
        "movq %%"src0", (%%"base",%%"REG_d",4) \n\t"\
        "movq %%"src1", 8(%%"base",%%"REG_d",4) \n\t"\
        "movq %%"src2", 16(%%"base",%%"REG_d",4) \n\t"\
        "movq %%"src3", 24(%%"base",%%"REG_d",4) \n\t"

/* Arithmetic right shift of every dword lane of m0..m3 by `bits`. */
#define snow_vertical_compose_mmx_sra(bits,m0,m1,m2,m3)\
        snow_vertical_compose_sse2_sra(bits,m0,m1,m2,m3)

/* Register-to-register add: dstN += srcN for N = 0..3. */
#define snow_vertical_compose_mmx_r2r_add(src0,src1,src2,src3,dst0,dst1,dst2,dst3)\
        snow_vertical_compose_sse2_r2r_add(src0,src1,src2,src3,dst0,dst1,dst2,dst3)

/* Left shift of every dword lane of m0..m3 by `bits`. */
#define snow_vertical_compose_mmx_sll(bits,m0,m1,m2,m3)\
        snow_vertical_compose_sse2_sll(bits,m0,m1,m2,m3)

/* Register-to-register copy: dstN = srcN for N = 0..3 (movq). */
#define snow_vertical_compose_mmx_move(src0,src1,src2,src3,dst0,dst1,dst2,dst3)\
        "movq %%"src0", %%"dst0" \n\t"\
        "movq %%"src1", %%"dst1" \n\t"\
        "movq %%"src2", %%"dst2" \n\t"\
        "movq %%"src3", %%"dst3" \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
179
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
180 void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
181 long i = width;
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
182 while(i & 0x7)
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
183 {
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
184 i--;
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
185 b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
186 b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
187 b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
188 b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
189 }
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
190
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
191 asm volatile(
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
192 "jmp 2f \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
193 "1: \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
194
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
195 "mov %6, %%"REG_a" \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
196 "mov %4, %%"REG_b" \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
197
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
198 snow_vertical_compose_mmx_load(REG_b,"mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
199 snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
200 snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
201 snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
202 snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
203
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
204 "pcmpeqd %%mm1, %%mm1 \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
205 "pslld $31, %%mm1 \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
206 "psrld $29, %%mm1 \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
207 "mov %5, %%"REG_a" \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
208
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
209 snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
210 snow_vertical_compose_mmx_sra("3","mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
211 snow_vertical_compose_mmx_load(REG_a,"mm1","mm3","mm5","mm7")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
212 snow_vertical_compose_mmx_sub("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
213 snow_vertical_compose_mmx_store(REG_a,"mm1","mm3","mm5","mm7")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
214 "mov %3, %%"REG_c" \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
215 snow_vertical_compose_mmx_load(REG_b,"mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
216 snow_vertical_compose_mmx_add(REG_c,"mm1","mm3","mm5","mm7")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
217 snow_vertical_compose_mmx_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
218 snow_vertical_compose_mmx_store(REG_b,"mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
219 "mov %2, %%"REG_a" \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
220 snow_vertical_compose_mmx_load(REG_c,"mm1","mm3","mm5","mm7")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
221 snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
222 snow_vertical_compose_mmx_sll("2","mm1","mm3","mm5","mm7")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
223 snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
224
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
225 "pcmpeqd %%mm1, %%mm1 \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
226 "pslld $31, %%mm1 \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
227 "psrld $28, %%mm1 \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
228 "mov %1, %%"REG_b" \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
229
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
230 snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
231 snow_vertical_compose_mmx_sra("4","mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
232 snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
233 snow_vertical_compose_mmx_store(REG_c,"mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
234 snow_vertical_compose_mmx_add(REG_b,"mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
235 snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
236 snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
237 snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
238 snow_vertical_compose_mmx_sra("1","mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
239 snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
240 snow_vertical_compose_mmx_store(REG_a,"mm0","mm2","mm4","mm6")
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
241
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
242 "2: \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
243 "sub $8, %%"REG_d" \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
244 "jge 1b \n\t"
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
245 :"+d"(i)
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
246 :
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
247 "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5):
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
248 "%"REG_a"","%"REG_b"","%"REG_c"");
33110c1008a4 Add the mmx and sse2 implementations of ff_snow_vertical_compose().
gpoirier
parents:
diff changeset
249 }