Mercurial > libavcodec.hg
comparison arm/dsputil_neon_s.S @ 8492:639169d7fad5 libavcodec
ARM: NEON optimised float_to_int16
author | mru |
---|---|
date | Fri, 26 Dec 2008 19:52:52 +0000 |
parents | 9281a8a9387a |
children | 307b176f91e7 |
comparison
equal
deleted
inserted
replaced
8491:902c43f89d92 | 8492:639169d7fad5 |
---|---|
270 | 270 |
271 pixfunc put_ pixels8 | 271 pixfunc put_ pixels8 |
272 pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8 | 272 pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8 |
273 pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8 | 273 pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8 |
274 pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1 | 274 pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1 |
275 | |
@ ff_float_to_int16_neon: convert an array of 32-bit floats to 16-bit integers. | |
@ Register roles, as read from the code below: | |
@   r0 = output cursor (every vst1 uses a :128 alignment hint and post-increments) | |
@   r1 = input cursor  (every vld1 uses a :128 alignment hint and post-increments) | |
@   r2 = number of elements; the branches below are written for a positive multiple of 8 | |
@   ip = countdown for the 16-element main loop | |
@ NOTE(review): presumably called from C as (int16_t *dst, const float *src, len) -- confirm against the caller. | |
@ Method: vcvt.s32.f32 with #16 yields signed 32-bit fixed point with 16 fraction bits, | |
@ and vshrn.s32 #16 then keeps the upper 16 bits of every 32-bit lane. | |
@ The NEON instructions between each subs/ands/bics and its branch leave the condition flags untouched. | |
276 function ff_float_to_int16_neon, export=1 | |
@ Prologue: r2 = count - 8, then load and convert the first 8 floats into q8/q9. | |
277 subs r2, r2, #8 | |
278 vld1.64 {d0-d1}, [r1,:128]! | |
279 vcvt.s32.f32 q8, q0, #16 | |
280 vld1.64 {d2-d3}, [r1,:128]! | |
281 vcvt.s32.f32 q9, q1, #16 | |
@ Count was exactly 8 (flags still from the subs above): just narrow and store at 3. | |
282 beq 3f | |
@ ip = remaining count rounded down to a multiple of 16; if that is zero skip the main loop. | |
283 bics ip, r2, #15 | |
284 beq 2f | |
@ Main loop: each pass narrows the 8 pending values in q8/q9, converts 8 more through q0/q1, | |
@ stores 16 results, and leaves the following 8 converted values pending in q8/q9. | |
285 1: subs ip, ip, #16 | |
286 vshrn.s32 d4, q8, #16 | |
287 vld1.64 {d0-d1}, [r1,:128]! | |
288 vcvt.s32.f32 q0, q0, #16 | |
289 vshrn.s32 d5, q9, #16 | |
290 vld1.64 {d2-d3}, [r1,:128]! | |
291 vcvt.s32.f32 q1, q1, #16 | |
292 vshrn.s32 d6, q0, #16 | |
293 vst1.64 {d4-d5}, [r0,:128]! | |
294 vshrn.s32 d7, q1, #16 | |
295 vld1.64 {d16-d17},[r1,:128]! | |
296 vcvt.s32.f32 q8, q8, #16 | |
297 vld1.64 {d18-d19},[r1,:128]! | |
298 vcvt.s32.f32 q9, q9, #16 | |
299 vst1.64 {d6-d7}, [r0,:128]! | |
@ Loop while ip (set by the subs at label 1) is non-zero. | |
300 bne 1b | |
@ r2 = leftover below 16; zero means only the 8 pending values in q8/q9 remain. | |
301 ands r2, r2, #15 | |
302 beq 3f | |
@ Tail for 16 values: the 8 pending in q8/q9 plus 8 freshly loaded into q0/q1. | |
303 2: vld1.64 {d0-d1}, [r1,:128]! | |
304 vshrn.s32 d4, q8, #16 | |
305 vcvt.s32.f32 q0, q0, #16 | |
306 vld1.64 {d2-d3}, [r1,:128]! | |
307 vshrn.s32 d5, q9, #16 | |
308 vcvt.s32.f32 q1, q1, #16 | |
309 vshrn.s32 d6, q0, #16 | |
310 vst1.64 {d4-d5}, [r0,:128]! | |
311 vshrn.s32 d7, q1, #16 | |
312 vst1.64 {d6-d7}, [r0,:128]! | |
313 bx lr | |
@ Tail for 8 values: narrow and store what is pending in q8/q9. | |
314 3: vshrn.s32 d4, q8, #16 | |
315 vshrn.s32 d5, q9, #16 | |
316 vst1.64 {d4-d5}, [r0,:128]! | |
317 bx lr | |
318 .endfunc | |
319 | |
@ ff_float_to_int16_interleave_neon: convert several float arrays to 16-bit integers and | |
@ write them interleaved (one 16-bit value per source array, repeated for every sample). | |
@ Register roles, as read from the code below: | |
@   r0 = output pointer | |
@   r1 = pointer to a table of source pointers, one per channel | |
@   r2 = samples per channel; the branches below are written for a positive multiple of 8 | |
@   r3 = number of channels | |
@ NOTE(review): presumably called from C as (int16_t *dst, const float **src, len, channels) -- confirm against the caller. | |
@ Three strategies are used: | |
@   r3 below 2  : load the first table entry and branch to ff_float_to_int16_neon | |
@   r3 equal 2  : dedicated path with contiguous 128-bit stores | |
@   r3 above 2  : strided path that handles channels in groups of 4, then 2, then 1 | |
@ All source loads carry a :128 alignment hint. Conversion is vcvt.s32.f32 with 16 fraction | |
@ bits, so the wanted 16-bit result is the upper half of each 32-bit lane. | |
@ Numeric labels (4, 6, 7, 8) are defined more than once; each Nf / Nb reference binds to the | |
@ nearest definition in that direction. | |
320 function ff_float_to_int16_interleave_neon, export=1 | |
321 cmp r3, #2 | |
@ Fewer than 2 channels: r1 = first source pointer, then tail-branch (plain b, lr untouched). | |
322 ldrlt r1, [r1] | |
323 blt ff_float_to_int16_neon | |
@ More than 2 channels: take the strided path at the first label 4 below. | |
324 bne 4f | |
325 | |
@ Exactly 2 channels: r3 = first source pointer, r1 = second source pointer. | |
326 ldr r3, [r1] | |
327 ldr r1, [r1, #4] | |
328 | |
@ Prologue: r2 = count - 8; convert the first 8 samples of each source | |
@ (first source into q8/q9, second source into q10/q11). | |
329 subs r2, r2, #8 | |
330 vld1.64 {d0-d1}, [r3,:128]! | |
331 vcvt.s32.f32 q8, q0, #16 | |
332 vld1.64 {d2-d3}, [r3,:128]! | |
333 vcvt.s32.f32 q9, q1, #16 | |
334 vld1.64 {d20-d21},[r1,:128]! | |
335 vcvt.s32.f32 q10, q10, #16 | |
336 vld1.64 {d22-d23},[r1,:128]! | |
337 vcvt.s32.f32 q11, q11, #16 | |
@ Count was exactly 8 (flags from the subs above): merge and store at 3. | |
338 beq 3f | |
@ ip = remaining count rounded down to a multiple of 16; zero skips the main loop. | |
339 bics ip, r2, #15 | |
340 beq 2f | |
@ Main loop, 16 samples per source per pass. vsri.32 dst, src, #16 inserts the upper 16 bits | |
@ of each src lane into the lower 16 bits of the matching dst lane, so every 32-bit lane | |
@ becomes one pair: first source in the low half, second source in the high half. | |
@ The pass ends with the next 8 samples of each source converted and pending in q8-q11. | |
341 1: subs ip, ip, #16 | |
342 vld1.64 {d0-d1}, [r3,:128]! | |
343 vcvt.s32.f32 q0, q0, #16 | |
344 vsri.32 q10, q8, #16 | |
345 vld1.64 {d2-d3}, [r3,:128]! | |
346 vcvt.s32.f32 q1, q1, #16 | |
347 vld1.64 {d24-d25},[r1,:128]! | |
348 vcvt.s32.f32 q12, q12, #16 | |
349 vld1.64 {d26-d27},[r1,:128]! | |
350 vsri.32 q11, q9, #16 | |
351 vst1.64 {d20-d21},[r0,:128]! | |
352 vcvt.s32.f32 q13, q13, #16 | |
353 vst1.64 {d22-d23},[r0,:128]! | |
354 vsri.32 q12, q0, #16 | |
355 vld1.64 {d16-d17},[r3,:128]! | |
356 vsri.32 q13, q1, #16 | |
357 vst1.64 {d24-d25},[r0,:128]! | |
358 vcvt.s32.f32 q8, q8, #16 | |
359 vld1.64 {d18-d19},[r3,:128]! | |
360 vcvt.s32.f32 q9, q9, #16 | |
361 vld1.64 {d20-d21},[r1,:128]! | |
362 vcvt.s32.f32 q10, q10, #16 | |
363 vld1.64 {d22-d23},[r1,:128]! | |
364 vcvt.s32.f32 q11, q11, #16 | |
365 vst1.64 {d26-d27},[r0,:128]! | |
@ Loop while ip (set by the subs at label 1) is non-zero. | |
366 bne 1b | |
@ r2 = leftover below 16; zero means only the pending 8 samples per source remain. | |
367 ands r2, r2, #15 | |
368 beq 3f | |
@ Tail for 16 samples per source: the pending 8 plus 8 freshly loaded. | |
369 2: vsri.32 q10, q8, #16 | |
370 vld1.64 {d0-d1}, [r3,:128]! | |
371 vcvt.s32.f32 q0, q0, #16 | |
372 vld1.64 {d2-d3}, [r3,:128]! | |
373 vcvt.s32.f32 q1, q1, #16 | |
374 vld1.64 {d24-d25},[r1,:128]! | |
375 vcvt.s32.f32 q12, q12, #16 | |
376 vsri.32 q11, q9, #16 | |
377 vld1.64 {d26-d27},[r1,:128]! | |
378 vcvt.s32.f32 q13, q13, #16 | |
379 vst1.64 {d20-d21},[r0,:128]! | |
380 vsri.32 q12, q0, #16 | |
381 vst1.64 {d22-d23},[r0,:128]! | |
382 vsri.32 q13, q1, #16 | |
383 vst1.64 {d24-d27},[r0,:128]! | |
384 bx lr | |
@ Tail for 8 samples per source: merge and store what is pending in q8-q11. | |
385 3: vsri.32 q10, q8, #16 | |
386 vsri.32 q11, q9, #16 | |
387 vst1.64 {d20-d23},[r0,:128]! | |
388 bx lr | |
389 | |
@ Strided path for more than 2 channels. r4-r8 and lr are saved because they are used as | |
@ source pointers, counter and output cursor below. | |
@ ip = r3 * 2 is the post-increment applied after every store in this path, | |
@ i.e. the byte distance from one output sample to the next for the same channel. | |
@ The lsl does not set flags, so the blt still tests the cmp against 4. | |
390 4: push {r4-r8,lr} | |
391 cmp r3, #4 | |
392 lsl ip, r3, #1 | |
393 blt 4f | |
394 | |
395 @ 4 channels | |
@ Load four source pointers (r1 advances past them); lr = sample counter, r8 = output cursor. | |
@ The first 4 samples of each source are converted into q8-q11 before the loop. | |
396 5: ldmia r1!, {r4-r7} | |
397 mov lr, r2 | |
398 mov r8, r0 | |
399 vld1.64 {d16-d17},[r4,:128]! | |
400 vcvt.s32.f32 q8, q8, #16 | |
401 vld1.64 {d18-d19},[r5,:128]! | |
402 vcvt.s32.f32 q9, q9, #16 | |
403 vld1.64 {d20-d21},[r6,:128]! | |
404 vcvt.s32.f32 q10, q10, #16 | |
405 vld1.64 {d22-d23},[r7,:128]! | |
406 vcvt.s32.f32 q11, q11, #16 | |
@ Loop, 8 samples per source per pass. vsri pairs sources 1+2 (in q9, q1) and 3+4 (in q11, q3); | |
@ vzip.32 then brings both pairs of the same sample into one d register, which is | |
@ stored as 8 bytes and followed by a step of ip. | |
407 6: subs lr, lr, #8 | |
408 vld1.64 {d0-d1}, [r4,:128]! | |
409 vcvt.s32.f32 q0, q0, #16 | |
410 vsri.32 q9, q8, #16 | |
411 vld1.64 {d2-d3}, [r5,:128]! | |
412 vcvt.s32.f32 q1, q1, #16 | |
413 vsri.32 q11, q10, #16 | |
414 vld1.64 {d4-d5}, [r6,:128]! | |
415 vcvt.s32.f32 q2, q2, #16 | |
416 vzip.32 d18, d22 | |
417 vld1.64 {d6-d7}, [r7,:128]! | |
418 vcvt.s32.f32 q3, q3, #16 | |
419 vzip.32 d19, d23 | |
420 vst1.64 {d18}, [r8], ip | |
421 vsri.32 q1, q0, #16 | |
422 vst1.64 {d22}, [r8], ip | |
423 vsri.32 q3, q2, #16 | |
424 vst1.64 {d19}, [r8], ip | |
425 vzip.32 d2, d6 | |
426 vst1.64 {d23}, [r8], ip | |
427 vzip.32 d3, d7 | |
@ lr reached zero (flags from the subs at label 6): flush the last 4 samples at 7 without loading more. | |
428 beq 7f | |
@ Otherwise store samples 4-7 of this pass while loading the first 4 samples of the next pass. | |
429 vld1.64 {d16-d17},[r4,:128]! | |
430 vcvt.s32.f32 q8, q8, #16 | |
431 vst1.64 {d2}, [r8], ip | |
432 vld1.64 {d18-d19},[r5,:128]! | |
433 vcvt.s32.f32 q9, q9, #16 | |
434 vst1.64 {d6}, [r8], ip | |
435 vld1.64 {d20-d21},[r6,:128]! | |
436 vcvt.s32.f32 q10, q10, #16 | |
437 vst1.64 {d3}, [r8], ip | |
438 vld1.64 {d22-d23},[r7,:128]! | |
439 vcvt.s32.f32 q11, q11, #16 | |
440 vst1.64 {d7}, [r8], ip | |
441 b 6b | |
442 7: vst1.64 {d2}, [r8], ip | |
443 vst1.64 {d6}, [r8], ip | |
444 vst1.64 {d3}, [r8], ip | |
445 vst1.64 {d7}, [r8], ip | |
@ Four channels done: return if none remain; otherwise move r0 past the 8 bytes just used | |
@ per sample and repeat while at least 4 channels remain (the add does not set flags). | |
446 subs r3, r3, #4 | |
447 popeq {r4-r8,pc} | |
448 cmp r3, #4 | |
449 add r0, r0, #8 | |
450 bge 5b | |
451 | |
452 @ 2 channels | |
@ Reached with fewer than 4 channels left; with fewer than 2, skip to the 1 channel code. | |
453 4: cmp r3, #2 | |
454 blt 4f | |
@ r4/r5 = the next two source pointers, lr = sample counter, r8 = output cursor. | |
@ The tst result is consumed by the beq after the loads: if bit 3 of the count is clear | |
@ the count is taken as a multiple of 16 and the 16-sample loop at 6 is entered directly. | |
455 ldmia r1!, {r4-r5} | |
456 mov lr, r2 | |
457 mov r8, r0 | |
458 tst lr, #8 | |
459 vld1.64 {d16-d17},[r4,:128]! | |
460 vcvt.s32.f32 q8, q8, #16 | |
461 vld1.64 {d18-d19},[r5,:128]! | |
462 vcvt.s32.f32 q9, q9, #16 | |
463 vld1.64 {d20-d21},[r4,:128]! | |
464 vcvt.s32.f32 q10, q10, #16 | |
465 vld1.64 {d22-d23},[r5,:128]! | |
466 vcvt.s32.f32 q11, q11, #16 | |
467 beq 6f | |
@ Bit 3 set: peel one block of 8 samples. If that was everything, finish at 7. | |
468 subs lr, lr, #8 | |
469 beq 7f | |
@ Store the peeled 8 samples as 32-bit pairs (one lane per sample, step ip) while | |
@ reloading q8-q11 with the next 8 samples of both sources. | |
470 vsri.32 d18, d16, #16 | |
471 vsri.32 d19, d17, #16 | |
472 vld1.64 {d16-d17},[r4,:128]! | |
473 vcvt.s32.f32 q8, q8, #16 | |
474 vst1.32 {d18[0]}, [r8], ip | |
475 vsri.32 d22, d20, #16 | |
476 vst1.32 {d18[1]}, [r8], ip | |
477 vsri.32 d23, d21, #16 | |
478 vst1.32 {d19[0]}, [r8], ip | |
479 vst1.32 {d19[1]}, [r8], ip | |
480 vld1.64 {d18-d19},[r5,:128]! | |
481 vcvt.s32.f32 q9, q9, #16 | |
482 vst1.32 {d22[0]}, [r8], ip | |
483 vst1.32 {d22[1]}, [r8], ip | |
484 vld1.64 {d20-d21},[r4,:128]! | |
485 vcvt.s32.f32 q10, q10, #16 | |
486 vst1.32 {d23[0]}, [r8], ip | |
487 vst1.32 {d23[1]}, [r8], ip | |
488 vld1.64 {d22-d23},[r5,:128]! | |
489 vcvt.s32.f32 q11, q11, #16 | |
@ Loop, 16 samples per pass: the 8 pending samples in q8-q11 are merged and stored while | |
@ 8 more are loaded into q0-q3 and merged into q1/q3. | |
490 6: subs lr, lr, #16 | |
491 vld1.64 {d0-d1}, [r4,:128]! | |
492 vcvt.s32.f32 q0, q0, #16 | |
493 vsri.32 d18, d16, #16 | |
494 vld1.64 {d2-d3}, [r5,:128]! | |
495 vcvt.s32.f32 q1, q1, #16 | |
496 vsri.32 d19, d17, #16 | |
497 vld1.64 {d4-d5}, [r4,:128]! | |
498 vcvt.s32.f32 q2, q2, #16 | |
499 vld1.64 {d6-d7}, [r5,:128]! | |
500 vcvt.s32.f32 q3, q3, #16 | |
501 vst1.32 {d18[0]}, [r8], ip | |
502 vsri.32 d22, d20, #16 | |
503 vst1.32 {d18[1]}, [r8], ip | |
504 vsri.32 d23, d21, #16 | |
505 vst1.32 {d19[0]}, [r8], ip | |
506 vsri.32 d2, d0, #16 | |
507 vst1.32 {d19[1]}, [r8], ip | |
508 vsri.32 d3, d1, #16 | |
509 vst1.32 {d22[0]}, [r8], ip | |
510 vsri.32 d6, d4, #16 | |
511 vst1.32 {d22[1]}, [r8], ip | |
512 vsri.32 d7, d5, #16 | |
513 vst1.32 {d23[0]}, [r8], ip | |
514 vst1.32 {d23[1]}, [r8], ip | |
@ lr reached zero: flush q1/q3 at the second label 6 below without loading more. | |
515 beq 6f | |
@ Otherwise store q1/q3 while loading the next 8 samples into q8-q11. | |
516 vld1.64 {d16-d17},[r4,:128]! | |
517 vcvt.s32.f32 q8, q8, #16 | |
518 vst1.32 {d2[0]}, [r8], ip | |
519 vst1.32 {d2[1]}, [r8], ip | |
520 vld1.64 {d18-d19},[r5,:128]! | |
521 vcvt.s32.f32 q9, q9, #16 | |
522 vst1.32 {d3[0]}, [r8], ip | |
523 vst1.32 {d3[1]}, [r8], ip | |
524 vld1.64 {d20-d21},[r4,:128]! | |
525 vcvt.s32.f32 q10, q10, #16 | |
526 vst1.32 {d6[0]}, [r8], ip | |
527 vst1.32 {d6[1]}, [r8], ip | |
528 vld1.64 {d22-d23},[r5,:128]! | |
529 vcvt.s32.f32 q11, q11, #16 | |
530 vst1.32 {d7[0]}, [r8], ip | |
531 vst1.32 {d7[1]}, [r8], ip | |
@ Flags are still those of the subs at the loop head; 6b is that loop head. | |
532 bgt 6b | |
@ Final flush of the last 8 merged samples held in q1/q3. | |
533 6: vst1.32 {d2[0]}, [r8], ip | |
534 vst1.32 {d2[1]}, [r8], ip | |
535 vst1.32 {d3[0]}, [r8], ip | |
536 vst1.32 {d3[1]}, [r8], ip | |
537 vst1.32 {d6[0]}, [r8], ip | |
538 vst1.32 {d6[1]}, [r8], ip | |
539 vst1.32 {d7[0]}, [r8], ip | |
540 vst1.32 {d7[1]}, [r8], ip | |
541 b 8f | |
@ Count was exactly 8: merge and store the 8 samples already converted in q8-q11. | |
542 7: vsri.32 d18, d16, #16 | |
543 vsri.32 d19, d17, #16 | |
544 vst1.32 {d18[0]}, [r8], ip | |
545 vsri.32 d22, d20, #16 | |
546 vst1.32 {d18[1]}, [r8], ip | |
547 vsri.32 d23, d21, #16 | |
548 vst1.32 {d19[0]}, [r8], ip | |
549 vst1.32 {d19[1]}, [r8], ip | |
550 vst1.32 {d22[0]}, [r8], ip | |
551 vst1.32 {d22[1]}, [r8], ip | |
552 vst1.32 {d23[0]}, [r8], ip | |
553 vst1.32 {d23[1]}, [r8], ip | |
@ Two channels done: r0 moves past the 4 bytes just used per sample; return if no channel | |
@ remains (the add does not set flags), otherwise fall through to the single-channel code. | |
554 8: subs r3, r3, #2 | |
555 add r0, r0, #4 | |
556 popeq {r4-r8,pc} | |
557 | |
558 @ 1 channel | |
@ r4 = next source pointer, lr = sample counter, r5 = output cursor. | |
@ Only 16-bit lanes 1 and 3 of each d register are stored: those are the upper halves of | |
@ the 32-bit converted lanes. Each store carries a :16 alignment hint and steps by ip. | |
@ The tst result is consumed by the bne after the loads (the movs do not set flags). | |
559 4: ldr r4, [r1],#4 | |
560 tst r2, #8 | |
561 mov lr, r2 | |
562 mov r5, r0 | |
563 vld1.64 {d0-d1}, [r4,:128]! | |
564 vcvt.s32.f32 q0, q0, #16 | |
565 vld1.64 {d2-d3}, [r4,:128]! | |
566 vcvt.s32.f32 q1, q1, #16 | |
@ Bit 3 of the count set: peel one block of 8 samples at 8 first. | |
567 bne 8f | |
@ Loop, 16 samples per pass: 8 pending in q0/q1 plus 8 loaded into q2/q3. | |
568 6: subs lr, lr, #16 | |
569 vld1.64 {d4-d5}, [r4,:128]! | |
570 vcvt.s32.f32 q2, q2, #16 | |
571 vld1.64 {d6-d7}, [r4,:128]! | |
572 vcvt.s32.f32 q3, q3, #16 | |
573 vst1.16 {d0[1]}, [r5,:16], ip | |
574 vst1.16 {d0[3]}, [r5,:16], ip | |
575 vst1.16 {d1[1]}, [r5,:16], ip | |
576 vst1.16 {d1[3]}, [r5,:16], ip | |
577 vst1.16 {d2[1]}, [r5,:16], ip | |
578 vst1.16 {d2[3]}, [r5,:16], ip | |
579 vst1.16 {d3[1]}, [r5,:16], ip | |
580 vst1.16 {d3[3]}, [r5,:16], ip | |
@ lr reached zero: skip the loads for the next pass and only flush q2/q3. | |
581 beq 7f | |
582 vld1.64 {d0-d1}, [r4,:128]! | |
583 vcvt.s32.f32 q0, q0, #16 | |
584 vld1.64 {d2-d3}, [r4,:128]! | |
585 vcvt.s32.f32 q1, q1, #16 | |
586 7: vst1.16 {d4[1]}, [r5,:16], ip | |
587 vst1.16 {d4[3]}, [r5,:16], ip | |
588 vst1.16 {d5[1]}, [r5,:16], ip | |
589 vst1.16 {d5[3]}, [r5,:16], ip | |
590 vst1.16 {d6[1]}, [r5,:16], ip | |
591 vst1.16 {d6[3]}, [r5,:16], ip | |
592 vst1.16 {d7[1]}, [r5,:16], ip | |
593 vst1.16 {d7[3]}, [r5,:16], ip | |
@ Flags are still those of the subs at the loop head; continue while samples remain. | |
594 bgt 6b | |
595 pop {r4-r8,pc} | |
@ Peeled block: store the 8 samples already in q0/q1; return if that was everything, | |
@ otherwise preload the next 8 and enter the 16-sample loop. | |
596 8: subs lr, lr, #8 | |
597 vst1.16 {d0[1]}, [r5,:16], ip | |
598 vst1.16 {d0[3]}, [r5,:16], ip | |
599 vst1.16 {d1[1]}, [r5,:16], ip | |
600 vst1.16 {d1[3]}, [r5,:16], ip | |
601 vst1.16 {d2[1]}, [r5,:16], ip | |
602 vst1.16 {d2[3]}, [r5,:16], ip | |
603 vst1.16 {d3[1]}, [r5,:16], ip | |
604 vst1.16 {d3[3]}, [r5,:16], ip | |
605 popeq {r4-r8,pc} | |
606 vld1.64 {d0-d1}, [r4,:128]! | |
607 vcvt.s32.f32 q0, q0, #16 | |
608 vld1.64 {d2-d3}, [r4,:128]! | |
609 vcvt.s32.f32 q1, q1, #16 | |
610 b 6b | |
611 .endfunc | |