Mercurial > libavcodec.hg
comparison arm/dsputil_neon.S @ 11443:361a5fcb4393 libavcodec
ARM: set size of asm functions in object files
| author | mru |
|---|---|
| date | Tue, 09 Mar 2010 16:17:56 +0000 |
| parents | 5c5b864d66e1 |
| children | 659f16d04776 |
Comparison legend: lines are marked as equal, deleted, inserted, or replaced.

| 11442:fe32d9ba1c86 | 11443:361a5fcb4393 |
|---|---|
238 .endm | 238 .endm |
239 | 239 |
240 .macro pixfunc pfx name suf rnd_op args:vararg | 240 .macro pixfunc pfx name suf rnd_op args:vararg |
241 function ff_\pfx\name\suf\()_neon, export=1 | 241 function ff_\pfx\name\suf\()_neon, export=1 |
242 \name \rnd_op \args | 242 \name \rnd_op \args |
243 .endfunc | 243 endfunc |
244 .endm | 244 .endm |
245 | 245 |
246 .macro pixfunc2 pfx name args:vararg | 246 .macro pixfunc2 pfx name args:vararg |
247 pixfunc \pfx \name | 247 pixfunc \pfx \name |
248 pixfunc \pfx \name \args | 248 pixfunc \pfx \name \args |
249 .endm | 249 .endm |
250 | 250 |
251 function ff_put_h264_qpel16_mc00_neon, export=1 | 251 function ff_put_h264_qpel16_mc00_neon, export=1 |
252 mov r3, #16 | 252 mov r3, #16 |
253 .endfunc | 253 endfunc |
254 | 254 |
255 pixfunc put_ pixels16 | 255 pixfunc put_ pixels16 |
256 pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8 | 256 pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8 |
257 pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8 | 257 pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8 |
258 pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1 | 258 pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1 |
259 | 259 |
260 function ff_avg_h264_qpel16_mc00_neon, export=1 | 260 function ff_avg_h264_qpel16_mc00_neon, export=1 |
261 mov r3, #16 | 261 mov r3, #16 |
262 .endfunc | 262 endfunc |
263 | 263 |
264 pixfunc avg_ pixels16,, 1 | 264 pixfunc avg_ pixels16,, 1 |
265 | 265 |
266 function ff_put_h264_qpel8_mc00_neon, export=1 | 266 function ff_put_h264_qpel8_mc00_neon, export=1 |
267 mov r3, #8 | 267 mov r3, #8 |
268 .endfunc | 268 endfunc |
269 | 269 |
270 pixfunc put_ pixels8 | 270 pixfunc put_ pixels8 |
271 pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8 | 271 pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8 |
272 pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8 | 272 pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8 |
273 pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1 | 273 pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1 |
274 | 274 |
275 function ff_avg_h264_qpel8_mc00_neon, export=1 | 275 function ff_avg_h264_qpel8_mc00_neon, export=1 |
276 mov r3, #8 | 276 mov r3, #8 |
277 .endfunc | 277 endfunc |
278 | 278 |
279 pixfunc avg_ pixels8,, 1 | 279 pixfunc avg_ pixels8,, 1 |
280 | 280 |
281 function ff_put_pixels_clamped_neon, export=1 | 281 function ff_put_pixels_clamped_neon, export=1 |
282 vld1.64 {d16-d19}, [r0,:128]! | 282 vld1.64 {d16-d19}, [r0,:128]! |
298 vst1.64 {d4}, [r1,:64], r2 | 298 vst1.64 {d4}, [r1,:64], r2 |
299 vst1.64 {d5}, [r1,:64], r2 | 299 vst1.64 {d5}, [r1,:64], r2 |
300 vst1.64 {d6}, [r1,:64], r2 | 300 vst1.64 {d6}, [r1,:64], r2 |
301 vst1.64 {d7}, [r1,:64], r2 | 301 vst1.64 {d7}, [r1,:64], r2 |
302 bx lr | 302 bx lr |
303 .endfunc | 303 endfunc |
304 | 304 |
305 function ff_put_signed_pixels_clamped_neon, export=1 | 305 function ff_put_signed_pixels_clamped_neon, export=1 |
306 vmov.u8 d31, #128 | 306 vmov.u8 d31, #128 |
307 vld1.64 {d16-d17}, [r0,:128]! | 307 vld1.64 {d16-d17}, [r0,:128]! |
308 vqmovn.s16 d0, q8 | 308 vqmovn.s16 d0, q8 |
335 vadd.u8 d6, d6, d31 | 335 vadd.u8 d6, d6, d31 |
336 vadd.u8 d7, d7, d31 | 336 vadd.u8 d7, d7, d31 |
337 vst1.64 {d6}, [r1,:64], r2 | 337 vst1.64 {d6}, [r1,:64], r2 |
338 vst1.64 {d7}, [r1,:64], r2 | 338 vst1.64 {d7}, [r1,:64], r2 |
339 bx lr | 339 bx lr |
340 .endfunc | 340 endfunc |
341 | 341 |
342 function ff_add_pixels_clamped_neon, export=1 | 342 function ff_add_pixels_clamped_neon, export=1 |
343 mov r3, r1 | 343 mov r3, r1 |
344 vld1.64 {d16}, [r1,:64], r2 | 344 vld1.64 {d16}, [r1,:64], r2 |
345 vld1.64 {d0-d1}, [r0,:128]! | 345 vld1.64 {d0-d1}, [r0,:128]! |
380 vst1.64 {d2}, [r3,:64], r2 | 380 vst1.64 {d2}, [r3,:64], r2 |
381 vqmovun.s16 d6, q3 | 381 vqmovun.s16 d6, q3 |
382 vst1.64 {d4}, [r3,:64], r2 | 382 vst1.64 {d4}, [r3,:64], r2 |
383 vst1.64 {d6}, [r3,:64], r2 | 383 vst1.64 {d6}, [r3,:64], r2 |
384 bx lr | 384 bx lr |
385 .endfunc | 385 endfunc |
386 | 386 |
387 function ff_float_to_int16_neon, export=1 | 387 function ff_float_to_int16_neon, export=1 |
388 subs r2, r2, #8 | 388 subs r2, r2, #8 |
389 vld1.64 {d0-d1}, [r1,:128]! | 389 vld1.64 {d0-d1}, [r1,:128]! |
390 vcvt.s32.f32 q8, q0, #16 | 390 vcvt.s32.f32 q8, q0, #16 |
424 bx lr | 424 bx lr |
425 3: vshrn.s32 d4, q8, #16 | 425 3: vshrn.s32 d4, q8, #16 |
426 vshrn.s32 d5, q9, #16 | 426 vshrn.s32 d5, q9, #16 |
427 vst1.64 {d4-d5}, [r0,:128]! | 427 vst1.64 {d4-d5}, [r0,:128]! |
428 bx lr | 428 bx lr |
429 .endfunc | 429 endfunc |
430 | 430 |
431 function ff_float_to_int16_interleave_neon, export=1 | 431 function ff_float_to_int16_interleave_neon, export=1 |
432 cmp r3, #2 | 432 cmp r3, #2 |
433 ldrlt r1, [r1] | 433 ldrlt r1, [r1] |
434 blt ff_float_to_int16_neon | 434 blt ff_float_to_int16_neon |
717 vld1.64 {d0-d1}, [r4,:128]! | 717 vld1.64 {d0-d1}, [r4,:128]! |
718 vcvt.s32.f32 q0, q0, #16 | 718 vcvt.s32.f32 q0, q0, #16 |
719 vld1.64 {d2-d3}, [r4,:128]! | 719 vld1.64 {d2-d3}, [r4,:128]! |
720 vcvt.s32.f32 q1, q1, #16 | 720 vcvt.s32.f32 q1, q1, #16 |
721 b 6b | 721 b 6b |
722 .endfunc | 722 endfunc |
723 | 723 |
724 function ff_vector_fmul_neon, export=1 | 724 function ff_vector_fmul_neon, export=1 |
725 mov r3, r0 | 725 mov r3, r0 |
726 subs r2, r2, #8 | 726 subs r2, r2, #8 |
727 vld1.64 {d0-d3}, [r0,:128]! | 727 vld1.64 {d0-d3}, [r0,:128]! |
757 vld1.64 {d6-d7}, [r1,:128]! | 757 vld1.64 {d6-d7}, [r1,:128]! |
758 vst1.64 {d18-d19},[r3,:128]! | 758 vst1.64 {d18-d19},[r3,:128]! |
759 vmul.f32 q9, q1, q3 | 759 vmul.f32 q9, q1, q3 |
760 3: vst1.64 {d16-d19},[r3,:128]! | 760 3: vst1.64 {d16-d19},[r3,:128]! |
761 bx lr | 761 bx lr |
762 .endfunc | 762 endfunc |
763 | 763 |
764 function ff_vector_fmul_window_neon, export=1 | 764 function ff_vector_fmul_window_neon, export=1 |
765 VFP vdup.32 q8, d0[0] | 765 VFP vdup.32 q8, d0[0] |
766 NOVFP vld1.32 {d16[],d17[]}, [sp,:32] | 766 NOVFP vld1.32 {d16[],d17[]}, [sp,:32] |
767 push {r4,r5,lr} | 767 push {r4,r5,lr} |
809 vrev64.32 q11, q11 | 809 vrev64.32 q11, q11 |
810 vswp d22, d23 | 810 vswp d22, d23 |
811 vst1.64 {d20,d21},[r0,:128]! | 811 vst1.64 {d20,d21},[r0,:128]! |
812 vst1.64 {d22,d23},[ip,:128], r5 | 812 vst1.64 {d22,d23},[ip,:128], r5 |
813 pop {r4,r5,pc} | 813 pop {r4,r5,pc} |
814 .endfunc | 814 endfunc |
815 | 815 |
816 #if CONFIG_VORBIS_DECODER | 816 #if CONFIG_VORBIS_DECODER |
817 function ff_vorbis_inverse_coupling_neon, export=1 | 817 function ff_vorbis_inverse_coupling_neon, export=1 |
818 vmov.i32 q10, #1<<31 | 818 vmov.i32 q10, #1<<31 |
819 subs r2, r2, #4 | 819 subs r2, r2, #4 |
870 vadd.f32 q1, q0, q2 | 870 vadd.f32 q1, q0, q2 |
871 vsub.f32 q0, q0, q3 | 871 vsub.f32 q0, q0, q3 |
872 vst1.32 {d2-d3}, [r0,:128]! | 872 vst1.32 {d2-d3}, [r0,:128]! |
873 vst1.32 {d0-d1}, [r1,:128]! | 873 vst1.32 {d0-d1}, [r1,:128]! |
874 bx lr | 874 bx lr |
875 .endfunc | 875 endfunc |
876 #endif | 876 #endif |
877 | 877 |
878 function ff_vector_fmul_scalar_neon, export=1 | 878 function ff_vector_fmul_scalar_neon, export=1 |
879 VFP len .req r2 | 879 VFP len .req r2 |
880 NOVFP len .req r3 | 880 NOVFP len .req r3 |
908 vst1.32 {q0},[r0,:128]! | 908 vst1.32 {q0},[r0,:128]! |
909 subs len, len, #4 | 909 subs len, len, #4 |
910 bgt 3b | 910 bgt 3b |
911 bx lr | 911 bx lr |
912 .unreq len | 912 .unreq len |
913 .endfunc | 913 endfunc |
914 | 914 |
915 function ff_vector_fmul_sv_scalar_2_neon, export=1 | 915 function ff_vector_fmul_sv_scalar_2_neon, export=1 |
916 VFP vdup.32 d16, d0[0] | 916 VFP vdup.32 d16, d0[0] |
917 NOVFP vdup.32 d16, r3 | 917 NOVFP vdup.32 d16, r3 |
918 NOVFP ldr r3, [sp] | 918 NOVFP ldr r3, [sp] |
934 vst1.32 {d5},[r0,:64]! | 934 vst1.32 {d5},[r0,:64]! |
935 b 1b | 935 b 1b |
936 2: vst1.32 {d4},[r0,:64]! | 936 2: vst1.32 {d4},[r0,:64]! |
937 vst1.32 {d5},[r0,:64]! | 937 vst1.32 {d5},[r0,:64]! |
938 bx lr | 938 bx lr |
939 .endfunc | 939 endfunc |
940 | 940 |
941 function ff_vector_fmul_sv_scalar_4_neon, export=1 | 941 function ff_vector_fmul_sv_scalar_4_neon, export=1 |
942 VFP vdup.32 q10, d0[0] | 942 VFP vdup.32 q10, d0[0] |
943 NOVFP vdup.32 q10, r3 | 943 NOVFP vdup.32 q10, r3 |
944 NOVFP ldr r3, [sp] | 944 NOVFP ldr r3, [sp] |
973 vmul.f32 q0, q0, q1 | 973 vmul.f32 q0, q0, q1 |
974 vst1.32 {q0},[r0,:128]! | 974 vst1.32 {q0},[r0,:128]! |
975 subs r3, r3, #4 | 975 subs r3, r3, #4 |
976 bgt 3b | 976 bgt 3b |
977 pop {pc} | 977 pop {pc} |
978 .endfunc | 978 endfunc |
979 | 979 |
980 function ff_sv_fmul_scalar_2_neon, export=1 | 980 function ff_sv_fmul_scalar_2_neon, export=1 |
981 VFP len .req r2 | 981 VFP len .req r2 |
982 NOVFP len .req r3 | 982 NOVFP len .req r3 |
983 VFP vdup.32 q8, d0[0] | 983 VFP vdup.32 q8, d0[0] |
996 vst1.32 {q1},[r0,:128]! | 996 vst1.32 {q1},[r0,:128]! |
997 b 1b | 997 b 1b |
998 2: vst1.32 {q1},[r0,:128]! | 998 2: vst1.32 {q1},[r0,:128]! |
999 bx lr | 999 bx lr |
1000 .unreq len | 1000 .unreq len |
1001 .endfunc | 1001 endfunc |
1002 | 1002 |
1003 function ff_sv_fmul_scalar_4_neon, export=1 | 1003 function ff_sv_fmul_scalar_4_neon, export=1 |
1004 VFP len .req r2 | 1004 VFP len .req r2 |
1005 NOVFP len .req r3 | 1005 NOVFP len .req r3 |
1006 VFP vdup.32 q8, d0[0] | 1006 VFP vdup.32 q8, d0[0] |
1011 vst1.32 {q0},[r0,:128]! | 1011 vst1.32 {q0},[r0,:128]! |
1012 subs len, len, #4 | 1012 subs len, len, #4 |
1013 bgt 1b | 1013 bgt 1b |
1014 bx lr | 1014 bx lr |
1015 .unreq len | 1015 .unreq len |
1016 .endfunc | 1016 endfunc |
1017 | 1017 |
1018 function ff_butterflies_float_neon, export=1 | 1018 function ff_butterflies_float_neon, export=1 |
1019 1: vld1.32 {q0},[r0,:128] | 1019 1: vld1.32 {q0},[r0,:128] |
1020 vld1.32 {q1},[r1,:128] | 1020 vld1.32 {q1},[r1,:128] |
1021 vsub.f32 q2, q0, q1 | 1021 vsub.f32 q2, q0, q1 |
1023 vst1.32 {q2},[r1,:128]! | 1023 vst1.32 {q2},[r1,:128]! |
1024 vst1.32 {q1},[r0,:128]! | 1024 vst1.32 {q1},[r0,:128]! |
1025 subs r2, r2, #4 | 1025 subs r2, r2, #4 |
1026 bgt 1b | 1026 bgt 1b |
1027 bx lr | 1027 bx lr |
1028 .endfunc | 1028 endfunc |
1029 | 1029 |
1030 function ff_scalarproduct_float_neon, export=1 | 1030 function ff_scalarproduct_float_neon, export=1 |
1031 vmov.f32 q2, #0.0 | 1031 vmov.f32 q2, #0.0 |
1032 1: vld1.32 {q0},[r0,:128]! | 1032 1: vld1.32 {q0},[r0,:128]! |
1033 vld1.32 {q1},[r1,:128]! | 1033 vld1.32 {q1},[r1,:128]! |
1036 bgt 1b | 1036 bgt 1b |
1037 vadd.f32 d0, d4, d5 | 1037 vadd.f32 d0, d4, d5 |
1038 vpadd.f32 d0, d0, d0 | 1038 vpadd.f32 d0, d0, d0 |
1039 NOVFP vmov.32 r0, d0[0] | 1039 NOVFP vmov.32 r0, d0[0] |
1040 bx lr | 1040 bx lr |
1041 .endfunc | 1041 endfunc |
1042 | 1042 |
1043 function ff_int32_to_float_fmul_scalar_neon, export=1 | 1043 function ff_int32_to_float_fmul_scalar_neon, export=1 |
1044 VFP vdup.32 q0, d0[0] | 1044 VFP vdup.32 q0, d0[0] |
1045 VFP len .req r2 | 1045 VFP len .req r2 |
1046 NOVFP vdup.32 q0, r2 | 1046 NOVFP vdup.32 q0, r2 |
1064 b 1b | 1064 b 1b |
1065 2: vst1.32 {q9}, [r0,:128]! | 1065 2: vst1.32 {q9}, [r0,:128]! |
1066 vst1.32 {q10},[r0,:128]! | 1066 vst1.32 {q10},[r0,:128]! |
1067 bx lr | 1067 bx lr |
1068 .unreq len | 1068 .unreq len |
1069 .endfunc | 1069 endfunc |
1070 | 1070 |
1071 function ff_vector_fmul_reverse_neon, export=1 | 1071 function ff_vector_fmul_reverse_neon, export=1 |
1072 add r2, r2, r3, lsl #2 | 1072 add r2, r2, r3, lsl #2 |
1073 sub r2, r2, #32 | 1073 sub r2, r2, #32 |
1074 mov r12, #-32 | 1074 mov r12, #-32 |
1088 vld1.32 {q2-q3}, [r2,:128], r12 | 1088 vld1.32 {q2-q3}, [r2,:128], r12 |
1089 vst1.32 {q8-q9}, [r0,:128]! | 1089 vst1.32 {q8-q9}, [r0,:128]! |
1090 b 1b | 1090 b 1b |
1091 2: vst1.32 {q8-q9}, [r0,:128]! | 1091 2: vst1.32 {q8-q9}, [r0,:128]! |
1092 bx lr | 1092 bx lr |
1093 .endfunc | 1093 endfunc |
1094 | 1094 |
1095 function ff_vector_fmul_add_neon, export=1 | 1095 function ff_vector_fmul_add_neon, export=1 |
1096 ldr r12, [sp] | 1096 ldr r12, [sp] |
1097 vld1.32 {q0-q1}, [r1,:128]! | 1097 vld1.32 {q0-q1}, [r1,:128]! |
1098 vld1.32 {q8-q9}, [r2,:128]! | 1098 vld1.32 {q8-q9}, [r2,:128]! |
1115 vld1.32 {q2-q3}, [r3,:128]! | 1115 vld1.32 {q2-q3}, [r3,:128]! |
1116 vst1.32 {q12-q13},[r0,:128]! | 1116 vst1.32 {q12-q13},[r0,:128]! |
1117 b 1b | 1117 b 1b |
1118 2: vst1.32 {q12-q13},[r0,:128]! | 1118 2: vst1.32 {q12-q13},[r0,:128]! |
1119 bx lr | 1119 bx lr |
1120 .endfunc | 1120 endfunc |
1121 | 1121 |
1122 function ff_vector_clipf_neon, export=1 | 1122 function ff_vector_clipf_neon, export=1 |
1123 VFP vdup.32 q1, d0[1] | 1123 VFP vdup.32 q1, d0[1] |
1124 VFP vdup.32 q0, d0[0] | 1124 VFP vdup.32 q0, d0[0] |
1125 NOVFP vdup.32 q0, r2 | 1125 NOVFP vdup.32 q0, r2 |
1141 vst1.f32 {q9},[r0,:128]! | 1141 vst1.f32 {q9},[r0,:128]! |
1142 b 1b | 1142 b 1b |
1143 2: vst1.f32 {q8},[r0,:128]! | 1143 2: vst1.f32 {q8},[r0,:128]! |
1144 vst1.f32 {q9},[r0,:128]! | 1144 vst1.f32 {q9},[r0,:128]! |
1145 bx lr | 1145 bx lr |
1146 .endfunc | 1146 endfunc |