changeset 4765:85298e8c55c4 libavcodec

bfin dsputils, basic pixel operations sads, diffs, motion compensation and standard IEEE 8x8 block transforms patch by Marc Hoffman, mmh pleasantst com
author diego
date Sun, 01 Apr 2007 22:28:45 +0000
parents da0598df2e53
children f43b50149387
files Makefile bfin/config_bfin.h bfin/dsputil_bfin.c bfin/fdct_bfin.S bfin/idct_bfin.S bfin/pixels_bfin.S
diffstat 6 files changed, 1677 insertions(+), 27 deletions(-) [+]
line wrap: on
line diff
--- a/Makefile	Sun Apr 01 22:20:51 2007 +0000
+++ b/Makefile	Sun Apr 01 22:28:45 2007 +0000
@@ -394,6 +394,11 @@
 
 OBJS-$(TARGET_ARCH_BFIN)               += bfin/dsputil_bfin.o \
 
+ASM_OBJS-$(TARGET_ARCH_BFIN)           += bfin/pixels_bfin.o \
+                                          bfin/idct_bfin.o   \
+                                          bfin/fdct_bfin.o   \
+                                          bfin/xidct.o       \
+
 EXTRALIBS := -L$(BUILD_ROOT)/libavutil -lavutil$(BUILDSUF) $(EXTRALIBS)
 
 NAME=avcodec
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bfin/config_bfin.h	Sun Apr 01 22:28:45 2007 +0000
@@ -0,0 +1,46 @@
+/*
+ * config_bfin.h
+ *
+ * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+/*
+   low level assembler interface wrapper
+
+DEFUN(put_pixels_clamped,mL1,
+        (DCTELEM *block, uint8_t *dest, int line_size)):
+
+      body
+
+      rts;
+*/
+#ifndef DEFUN
+
+#ifndef mL1
+#define mL1 .l1.text
+#endif
+#define mL3 .text
+
+#define DEFUN(fname,where,interface) \
+        .section where;              \
+        .global _ff_bfin_ ## fname ; \
+        .align 8;                    \
+        _ff_bfin_ ## fname
+
+#endif
+
--- a/bfin/dsputil_bfin.c	Sun Apr 01 22:20:51 2007 +0000
+++ b/bfin/dsputil_bfin.c	Sun Apr 01 22:28:45 2007 +0000
@@ -1,5 +1,8 @@
 /*
- * Copyright (c) 2006 Michael Benjamin
+ * BlackFin DSPUTILS
+ *
+ * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
+ * Copyright (c) 2006 Michael Benjamin <michael.benjamin@analog.com>
  *
  * This file is part of FFmpeg.
  *
@@ -18,38 +21,290 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include <unistd.h>
+#include <bits/bfin_sram.h>
 #include "../avcodec.h"
 #include "../dsputil.h"
 
-static int sad8x8_bfin( void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h )
+#define USE_L1CODE
+
+#ifdef USE_L1CODE
+#define L1CODE __attribute__ ((l1_text))
+#else
+#define L1CODE
+#endif
+int off;
+
+
+extern void ff_bfin_idct (DCTELEM *block) L1CODE;
+extern void ff_bfin_fdct (DCTELEM *block) L1CODE;
+extern void ff_bfin_add_pixels_clamped (DCTELEM *block, uint8_t *dest, int line_size) L1CODE;
+extern void ff_bfin_put_pixels_clamped (DCTELEM *block, uint8_t *dest, int line_size) L1CODE;
+extern void ff_bfin_diff_pixels (DCTELEM *block, uint8_t *s1, uint8_t *s2, int stride)  L1CODE;
+extern void ff_bfin_get_pixels  (DCTELEM *restrict block, const uint8_t *pixels, int line_size) L1CODE;
+extern int  ff_bfin_pix_norm1  (uint8_t * pix, int line_size) L1CODE;
+extern int  ff_bfin_z_sad8x8   (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h) L1CODE;
+extern int  ff_bfin_z_sad16x16 (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h) L1CODE;
+
+extern void ff_bfin_z_put_pixels16_xy2     (uint8_t *block, const uint8_t *s0, int dest_size, int line_size, int h) L1CODE;
+extern void ff_bfin_z_put_pixels8_xy2      (uint8_t *block, const uint8_t *s0, int dest_size, int line_size, int h) L1CODE;
+extern void ff_bfin_put_pixels16_xy2_nornd (uint8_t *block, const uint8_t *s0, int line_size, int h) L1CODE;
+extern void ff_bfin_put_pixels8_xy2_nornd  (uint8_t *block, const uint8_t *s0, int line_size, int h) L1CODE;
+
+
+extern int  ff_bfin_pix_sum (uint8_t *p, int stride) L1CODE;
+
+extern void ff_bfin_put_pixels8uc        (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int dest_size, int line_size, int h) L1CODE;
+extern void ff_bfin_put_pixels16uc       (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int dest_size, int line_size, int h) L1CODE;
+extern void ff_bfin_put_pixels8uc_nornd  (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int line_size, int h) L1CODE;
+extern void ff_bfin_put_pixels16uc_nornd (uint8_t *block, const uint8_t *s0, const uint8_t *s1, int line_size, int h) L1CODE;
+
+extern int ff_bfin_sse4  (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) L1CODE;
+extern int ff_bfin_sse8  (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) L1CODE;
+extern int ff_bfin_sse16 (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) L1CODE;
+
+
+#if 0
+void pblk (uint8_t *p, int w, int h, int s)
+{
+    int i,j;
+    av_log (0,0,"0x%08x:\n", p);
+    for (i = 0;i<h;i++) {
+        for (j=0;j<w;j++)
+            av_log (0,0,"%3d ", p[j]);
+        p+=s;
+        av_log (0,0,"\n");
+    }
+    av_log (0,0,"\n");
+}
+#endif
+
+static void bfin_idct_add (uint8_t *dest, int line_size, DCTELEM *block)
+{
+    ff_bfin_idct (block);
+    ff_bfin_add_pixels_clamped (block, dest, line_size);
+}
+
+static void bfin_idct_put (uint8_t *dest, int line_size, DCTELEM *block)
+{
+    ff_bfin_idct (block);
+    ff_bfin_put_pixels_clamped (block, dest, line_size);
+}
+
+
+static void bfin_clear_blocks (DCTELEM *blocks)
+{
+    // Zero 192 32-bit words (768 bytes) starting at blocks, i.e. a memset.
+    // The "memory" clobber tells the compiler that the asm writes to memory.
+    asm("P0=192; "
+        "I0=%0;  "
+        "R0=0;   "
+        "LSETUP(clear_blocks_blkfn_lab,clear_blocks_blkfn_lab)LC0=P0;"
+        "clear_blocks_blkfn_lab:"
+        "[I0++]=R0;"
+        ::"a" (blocks):"P0","I0","R0","memory");
+}
+
+
+
+static void bfin_put_pixels8 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    ff_bfin_put_pixels8uc (block, pixels, pixels, line_size, line_size, h);
+}
+
+static void bfin_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    ff_bfin_put_pixels8uc (block, pixels, pixels+1, line_size, line_size, h);
+}
+
+static void bfin_put_pixels8_y2 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    ff_bfin_put_pixels8uc (block, pixels, pixels+line_size, line_size, line_size, h);
+}
+
+static void bfin_put_pixels8_xy2 (uint8_t *block, const uint8_t *s0, int line_size, int h)
+{
+    ff_bfin_z_put_pixels8_xy2 (block,s0,line_size, line_size, h);
+}
+
+static void bfin_put_pixels16 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
-    int sum;
-    __asm__ __volatile__ (
-    "P0 = %1;" // blk1
-    "P1 = %2;" // blk2
-    "P2 = %3;\n" // h
-    "I0 = P0;"
-    "I1 = P1;\n"
-    "A0 = 0;"
-    "A1 = 0;\n"
-    "M0 = P2;\n"
-    "P3 = 32;\n"
-    "LSETUP (sad8x8LoopBegin, sad8x8LoopEnd) LC0=P3;\n"
-    "sad8x8LoopBegin:\n"
-    "  DISALGNEXCPT || R0 = [I0] || R2 = [I1];\n"
-    "  DISALGNEXCPT || R1 = [I0++] || R3 = [I1++];\n"
-    "sad8x8LoopEnd:\n"
-    "  SAA ( R1:0 , R3:2 );\n"
-    "R3 = A1.L + A1.H, R2 = A0.L + A0.H;\n"
-    "%0 = R2 + R3 (S);\n"
-    : "=&d" (sum)
-    : "m"(blk1), "m"(blk2), "m"(h)
-    : "P0","P1","P2","I0","I1","A0","A1","R0","R1","R2","R3");
-    return sum;
+    ff_bfin_put_pixels16uc (block, pixels, pixels, line_size, line_size, h);
+}
+
+static void bfin_put_pixels16_x2 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    ff_bfin_put_pixels16uc (block, pixels, pixels+1, line_size, line_size, h);
+}
+
+static void bfin_put_pixels16_y2 (uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    ff_bfin_put_pixels16uc (block, pixels, pixels+line_size, line_size, line_size, h);
+}
+
+static void bfin_put_pixels16_xy2 (uint8_t *block, const uint8_t *s0, int line_size, int h)
+{
+    ff_bfin_z_put_pixels16_xy2 (block,s0,line_size, line_size, h);
+}
+
+/* File-local like its siblings; passes pixels as both source operands. */
+static void bfin_put_pixels8_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    ff_bfin_put_pixels8uc_nornd (block, pixels, pixels, line_size, h);
+}
+
+static void bfin_put_pixels8_x2_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    ff_bfin_put_pixels8uc_nornd (block, pixels, pixels+1, line_size, h);
+}
+
+static void bfin_put_pixels8_y2_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    ff_bfin_put_pixels8uc_nornd (block, pixels, pixels+line_size, line_size, h);
+}
+
+
+/* File-local like its siblings; passes pixels as both source operands. */
+static void bfin_put_pixels16_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    ff_bfin_put_pixels16uc_nornd (block, pixels, pixels, line_size, h);
+}
+
+static void bfin_put_pixels16_x2_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    ff_bfin_put_pixels16uc_nornd (block, pixels, pixels+1, line_size, h);
+}
+
+static void bfin_put_pixels16_y2_nornd (uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    ff_bfin_put_pixels16uc_nornd (block, pixels, pixels+line_size, line_size, h);
+}
+
+static int bfin_pix_abs16 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
+{
+    return ff_bfin_z_sad16x16 (blk1,blk2,line_size,line_size,h);
 }
 
+static uint8_t vtmp_blk[256] __attribute__((l1_data_B));
+
+static int bfin_pix_abs16_x2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
+{
+    ff_bfin_put_pixels16uc (vtmp_blk, blk2, blk2+1, 16, line_size, h);
+    return ff_bfin_z_sad16x16 (blk1, vtmp_blk, line_size, 16, h);
+}
+
+static int bfin_pix_abs16_y2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
+{
+    ff_bfin_put_pixels16uc (vtmp_blk, blk2, blk2+line_size, 16, line_size, h);
+    return ff_bfin_z_sad16x16 (blk1, vtmp_blk, line_size, 16, h);
+}
+
+static int bfin_pix_abs16_xy2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
+{
+    ff_bfin_z_put_pixels16_xy2 (vtmp_blk, blk2, 16, line_size, h);
+    return ff_bfin_z_sad16x16 (blk1, vtmp_blk, line_size, 16, h);
+}
+
+static int bfin_pix_abs8 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
+{
+    return ff_bfin_z_sad8x8 (blk1,blk2,line_size,line_size, h);
+}
+
+static int bfin_pix_abs8_x2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
+{
+    ff_bfin_put_pixels8uc (vtmp_blk, blk2, blk2+1, 8, line_size, h);
+    return ff_bfin_z_sad8x8 (blk1, vtmp_blk, line_size, 8, h);
+}
+
+static int bfin_pix_abs8_y2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
+{
+    ff_bfin_put_pixels8uc (vtmp_blk, blk2, blk2+line_size, 8, line_size, h);
+    return ff_bfin_z_sad8x8 (blk1, vtmp_blk, line_size, 8, h);
+}
+
+static int bfin_pix_abs8_xy2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_size, int h)
+{
+    ff_bfin_z_put_pixels8_xy2 (vtmp_blk, blk2, 8, line_size, h);
+    return ff_bfin_z_sad8x8 (blk1, vtmp_blk, line_size, 8, h);
+}
+
+
+/*
+  decoder optimization
+  start on 2/11 100 frames of 352x240@25 compiled with no optimization -g debugging
+  9.824s ~ 2.44x off
+  6.360s ~ 1.58x off with -O2
+  5.740s ~ 1.43x off with idcts
+
+  2.64s    2/20 same sman.mp4 decode only
+
+*/
+
 void dsputil_init_bfin( DSPContext* c, AVCodecContext *avctx )
 {
-    c->pix_abs[1][0] = sad8x8_bfin;
-    c->sad[1] = sad8x8_bfin;
+    c->get_pixels         = ff_bfin_get_pixels;
+    c->diff_pixels        = ff_bfin_diff_pixels;
+    c->put_pixels_clamped = ff_bfin_put_pixels_clamped;
+    c->add_pixels_clamped = ff_bfin_add_pixels_clamped;
+
+    c->clear_blocks       = bfin_clear_blocks;
+    c->pix_sum            = ff_bfin_pix_sum;
+    c->pix_norm1          = ff_bfin_pix_norm1;
+
+    c->sad[0]             = bfin_pix_abs16;
+    c->sad[1]             = bfin_pix_abs8;
+
+    /* TODO [0] 16  [1] 8 */
+    c->pix_abs[0][0] = bfin_pix_abs16;
+    c->pix_abs[0][1] = bfin_pix_abs16_x2;
+    c->pix_abs[0][2] = bfin_pix_abs16_y2;
+    c->pix_abs[0][3] = bfin_pix_abs16_xy2;
+
+    c->pix_abs[1][0] = bfin_pix_abs8;
+    c->pix_abs[1][1] = bfin_pix_abs8_x2;
+    c->pix_abs[1][2] = bfin_pix_abs8_y2;
+    c->pix_abs[1][3] = bfin_pix_abs8_xy2;
+
+
+    c->sse[0] = ff_bfin_sse16;
+    c->sse[1] = ff_bfin_sse8;
+    c->sse[2] = ff_bfin_sse4;
+
+
+    /**
+     * Halfpel motion compensation with rounding (a+b+1)>>1.
+     * This is an array[4][4] of motion compensation functions for 4
+     * horizontal blocksizes (8,16) and the 4 halfpel positions
+     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
+     * @param block destination where the result is stored
+     * @param pixels source
+     * @param line_size number of bytes in a horizontal line of block
+     * @param h height
+     */
+
+    c->put_pixels_tab[0][0] = bfin_put_pixels16;
+    c->put_pixels_tab[0][1] = bfin_put_pixels16_x2;
+    c->put_pixels_tab[0][2] = bfin_put_pixels16_y2;
+    c->put_pixels_tab[0][3] = bfin_put_pixels16_xy2;
+
+    c->put_pixels_tab[1][0] = bfin_put_pixels8;
+    c->put_pixels_tab[1][1] = bfin_put_pixels8_x2;
+    c->put_pixels_tab[1][2] = bfin_put_pixels8_y2;
+    c->put_pixels_tab[1][3] = bfin_put_pixels8_xy2;
+
+    c->put_no_rnd_pixels_tab[1][0] = bfin_put_pixels8_nornd;
+    c->put_no_rnd_pixels_tab[1][1] = bfin_put_pixels8_x2_nornd;
+    c->put_no_rnd_pixels_tab[1][2] = bfin_put_pixels8_y2_nornd;
+    c->put_no_rnd_pixels_tab[1][3] = ff_bfin_put_pixels8_xy2_nornd;
+
+    c->put_no_rnd_pixels_tab[0][0] = bfin_put_pixels16_nornd;
+    c->put_no_rnd_pixels_tab[0][1] = bfin_put_pixels16_x2_nornd;
+    c->put_no_rnd_pixels_tab[0][2] = bfin_put_pixels16_y2_nornd;
+    c->put_no_rnd_pixels_tab[0][3] = ff_bfin_put_pixels16_xy2_nornd;
+
+    c->fdct               = ff_bfin_fdct;
+    c->idct               = ff_bfin_idct;
+    c->idct_add           = bfin_idct_add;
+    c->idct_put           = bfin_idct_put;
 }
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bfin/fdct_bfin.S	Sun Apr 01 22:28:45 2007 +0000
@@ -0,0 +1,324 @@
+/*
+ * fdct BlackFin
+ *
+ * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+/*
+  void ff_bfin_fdct (DCTELEM *buf);
+
+  This implementation works only for 8x8 input. The range of input
+  must be -256 to 255 i.e. 8bit input represented in a 16bit data
+  word. The original data must be sign extended into the 16bit data
+  words.
+
+
+   Chen factorization of
+
+           7
+   X(m) = sum (x(n) * cos ((2n+1)*m*pi/16))
+          n=0
+
+                                             C4
+ 0  --*-------------*0+7---*-----*0+3-------*-*-------------------> 0
+       \           /        \   /            X S4,S4
+ 1  --*-\---------/-*1+6---*-\-/-*1+2-------*-*-------------------> 4
+         \       /            \              -C4     C3
+ 2  --*---\-----/---*2+5---*-/-\-*1-2---------------*-*-----------> 2
+           \   /            /   \                    X S3,-S3
+ 3  --*-----\-/-----*3+4---*-----*0-3---------------*-*-----------> 6
+             /                                  C7   C3
+ 4  --*-----/-\-----*3-4------------*-*4+5--*-----*---------------> 1
+           /   \            -C4      X       \   /S7    C3
+ 5  --*---/-----\---*2-5---*-*------*=*4-5----\-/------*-*--------> 5
+         /       \          X S4,S4            /        X S3,-S3
+ 6  --*-/---------\-*1-6---*-*------*=*7-6----/-\------*-*--------> 3
+       /           \        C4       X       /   \-S7   C3
+    --*-------------*0-7------------*-*7+6--*-----*---------------> 7
+                                                C7
+
+Notation
+        Cn = cos(n*pi/16) used throughout the code.
+
+
+  Registers used:
+        R0, R1, R2, R3, R4, R5, R6,R7,  P0, P1, P2, P3, P4, P5, A0, A1.
+  Other registers used:
+        I0, I1, I2, I3, B0, B1, B2, B3, M0, M1, M2, L3 registers and LC0, LC1.
+
+  Input - r0 - pointer to start of DCTELEM *block
+
+  Output - The DCT output coefficients in the DCTELEM *block
+
+  Register constraint:
+               This code is called from jpeg_encode.
+               R6, R5, R4 if modified should be stored and restored.
+
+
+  Performance: (Timer version 0.6.33)
+               Code Size : 240 Bytes.
+               Memory Required :
+               Input Matrix : 8 * 8 * 2 Bytes.
+               Coefficients : 16 Bytes
+               Temporary matrix: 8 * 8 * 2 Bytes.
+               Cycle Count :26+{18+8*(14+2S)}*2  where S -> Stalls
+                            (7.45 c/pel)
+        -----------------------------------------
+        |  Size  |  Forward DCT  |  Inverse DCT |
+        -----------------------------------------
+        |  8x8   |   284 Cycles  |  311 Cycles  |
+        -----------------------------------------
+
+Ck = int16(cos(k/16*pi)*32767+.5)/2
+#define C4 23170
+#define C3 13623
+#define C6 6270
+#define C7 3196
+
+Sk = int16(sin(k/16*pi)*32767+.5)/2
+#define S4 11585
+#define S3 9102
+#define S6 15137
+#define S7 16069
+
+the coefficients are ordered as follows:
+short dct_coef[]
+  C4,S4,
+  C6,S6,
+  C7,S7,
+  S3,C3,
+
+-----------------------------------------------------------
+FFMPEG conformance testing results
+-----------------------------------------------------------
+dct-test: modified with the following
+            dct_error("BFINfdct", 0, ff_bfin_fdct, fdct, test);
+produces the following output:
+
+root:/u/ffmpeg/bhead/libavcodec> ./dct-test
+ffmpeg DCT/IDCT test
+
+    2  -131    -6   -48   -36    33   -83    24
+   34    52   -24   -15     5    92    57   143
+  -67   -43    -1    74   -16     5   -71    32
+  -78   106    92   -34   -38    81    20   -18
+    7   -62    40     2   -15    90   -62   -83
+  -83     1  -104   -13    43   -19     7    11
+  -63    31    12   -29    83    72    21    10
+  -17   -63   -15    73    50   -91   159   -14
+DCT BFINfdct: err_inf=2 err2=0.16425938 syserr=0.00795000 maxout=2098 blockSumErr=27
+DCT BFINfdct: 92.1 kdct/s
+root:/u/ffmpeg/bhead/libavcodec>
+
+*/
+
+#include "config_bfin.h"
+
+.section .l1.data.B,"aw",@progbits
+.align 4;
+dct_coeff:
+.short 0x5a82, 0x2d41, 0x187e, 0x3b21, 0x0c7c, 0x3ec5, 0x238e, 0x3537;
+
+.section .l1.data.A,"aw",@progbits
+.align 4
+vtmp:   .space 128
+
+DEFUN(fdct,mL1,
+        (DCTELEM *block)):
+    [--SP] = (R7:4, P5:3);          // Push the registers onto the stack.
+
+    b0 = r0;
+    r0 = [P3+dct_coeff@GOT17M4];
+    b3 = r0;
+    r0 = [P3+vtmp@GOT17M4];
+    b2 = r0;
+
+    L3 = 16;                        // L3 is set to 16 to make the coefficient
+                                    // array Circular.
+
+
+//----------------------------------------------------------------------------
+
+/*
+ * I0, I1, and I2 registers are used to read the input data. I3 register is used
+ * to read the coefficients. P0 and P1 registers are used for writing the output
+ * data.
+ */
+    M0 = 12 (X);                    // All these initializations are used in the
+    M1 = 16 (X);                    // modification of address offsets.
+
+    M2 = 128 (X);
+
+    P2 = 16;
+    P3 = 32 (X);
+    P4 = -110 (X);
+    P5 = -62 (X);
+    P0 = 2(X);
+
+
+    // Prescale the input to get the correct precision.
+    i0=b0;
+    i1=b0;
+
+    lsetup (.0, .1) LC0 = P3;
+    r0=[i0++];
+.0:     r1=r0<<3 (v) || r0=[i0++] ;
+.1:     [i1++]=r1;
+
+        /*
+         * B0 points to the "in" buffer.
+         * B2 points to "temp" buffer in the first iteration.
+         */
+
+    lsetup (.2, .3) LC0 = P0;
+.2:
+        I0 = B0;                        // I0 points to Input Element (0, 0).
+        I1 = B0;                        // Element 1 and 0 is read in R0.
+        I1 += M0  || R0 = [I0++];       // I1 points to Input Element (0, 6).
+        I2 = I1;                        // Element 6 is read into R3.H.
+        I2 -= 4   || R3.H = W[I1++];    // I2 points to Input Element (0, 4).
+
+        I3 = B3;                        // I3 points to Coefficients.
+        P0 = B2;                        // P0 points to temporary array Element
+                                        //   (0, 0).
+        P1 = B2;                        // P1 points to temporary array.
+        R7 = [P1++P2] || R2 = [I2++];   // P1 points to temporary array
+                                        //   Element (1, 0).
+                                        // R7 is a dummy read. X4,X5
+                                        //   are read into R2.
+        R3.L = W[I1--];                 // X7 is read into R3.L.
+        R1.H = W[I0++];                 // X2 is read into R1.H.
+
+
+        /*
+         *  X0 = (X0 + X7) / 2.
+         *  X1 = (X1 + X6) / 2.
+         *  X6 = (X1 - X6) / 2.
+         *  X7 = (X0 - X7) / 2.
+         *  It reads the data 3 in R1.L.
+         */
+
+        R0 = R0 +|+ R3, R3 = R0 -|- R3 || R1.L = W[I0++] || NOP;
+
+        /*
+         *       X2 = (X2 + X5) / 2.
+         *       X3 = (X3 + X4) / 2.
+         *       X4 = (X3 - X4) / 2.
+         *       X5 = (X2 - X5) / 2.
+         *          R7 = C4 = cos(4*pi/16)
+         */
+
+        R1 = R1 +|+ R2, R2 = R1 -|- R2 (CO) || NOP      ||  R7 = [I3++];
+
+        /*
+         * At the end of stage 1 R0 has (1,0), R1 has (2,3), R2 has (4, 5) and
+         * R3 has (6,7).
+         * Where the notation (x, y) represents upper/lower half pairs.
+         */
+
+        /*
+         *       X0 = X0 + X3.
+         *       X1 = X1 + X2.
+         *       X2 = X1 - X2.
+         *       X3 = X0 - X3.
+         */
+        R0 = R0 +|+ R1, R1 = R0 -|- R1;
+
+        lsetup (.row0, .row1) LC1 = P2 >> 1;  // 1d dct, loops 8x
+.row0:
+
+        /*
+         *       This is part 2 computation continued.....
+         *       A1 =      X6 * cos(pi/4)
+         *       A0 =      X6 * cos(pi/4)
+         *       A1 = A1 - X5 * cos(pi/4)
+         *       A0 = A0 + X5 * cos(pi/4).
+         *       The instruction W[I0] = R3.L is used for packing it to R2.L.
+         */
+
+        A1=R3.H*R7.l,         A0=R3.H*R7.l            ||  I1+=M1 || W[I0] = R3.L;
+        R4.H=(A1-=R2.L*R7.l), R4.L=(A0+=R2.L*R7.l)    ||  I2+=M0 || NOP;
+
+        /*       R0 = (X1,X0)      R1 = (X2,X3)       R4 = (X5, X6). */
+
+        /*
+         *       A1 =      X0 * cos(pi/4)
+         *       A0 =      X0 * cos(pi/4)
+         *       A1 = A1 - X1 * cos(pi/4)
+         *       A0 = A0 + X1 * cos(pi/4)
+         *       R7 = (C2,C6)
+         */
+        A1=R0.L*R7.h,        A0=R0.L*R7.h             || NOP       || R3.H=W[I1++];
+        R5.H=(A1-=R0.H*R7.h),R5.L=(A0+=R0.H*R7.h)     || R7=[I3++] || NOP;
+
+        /*
+         *       A1 =      X2 * cos(3pi/8)
+         *       A0 =      X3 * cos(3pi/8)
+         *       A1 = A1 + X3 * cos(pi/8)
+         *       A0 = A0 - X2 * cos(pi/8)
+         *         R3 = cos(pi/4)
+         *         R7 = (cos(7pi/8),cos(pi/8))
+         *       X4 = X4 + X5.
+         *       X5 = X4 - X5.
+         *       X6 = X7 - X6.
+         *       X7 = X7 + X6.
+         */
+        A1=R1.H*R7.L,        A0=R1.L*R7.L             || W[P0++P3]=R5.L || R2.L=W[I0];
+        R2=R2+|+R4,          R4=R2-|-R4               || I0+=4          || R3.L=W[I1--];
+        R6.H=(A1+=R1.L*R7.H),R6.L=(A0 -= R1.H * R7.H) || I0+=4          || R7=[I3++];
+
+        /*         R2 = (X4, X7)        R4 = (X5,X6)      R5 = (X1, X0)       R6 = (X2,X3). */
+
+        /*
+         *       A1 =      X4 * cos(7pi/16)
+         *       A0 =      X7 * cos(7pi/16)
+         *       A1 = A1 + X7 * cos(pi/16)
+         *       A0 = A0 - X4 * cos(pi/16)
+         */
+
+        A1=R2.H*R7.L,        A0=R2.L*R7.L             || W[P0++P3]=R6.H || R0=[I0++];
+        R2.H=(A1+=R2.L*R7.H),R2.L=(A0-=R2.H*R7.H)     || W[P0++P3]=R5.H || R7=[I3++];
+
+        /*
+         *       A1 =      X5 * cos(3pi/16)
+         *       A0 =      X6 * cos(3pi/16)
+         *       A1 = A1 + X6 * cos(5pi/16)
+         *       A0 = A0 - X5 * cos(5pi/16)
+         *       The output values are written.
+         */
+
+        A1=R4.H*R7.H,        A0=R4.L*R7.H             || W[P0++P2]=R6.L || R1.H=W[I0++];
+        R4.H=(A1+=R4.L*R7.L),R4.L=(A0-=R4.H*R7.L)     || W[P0++P4]=R2.L || R1.L=W[I0++];
+
+
+        /* Beginning of next stage, **pipelined** + drain and store the
+           rest of the column store. */
+
+        R0=R0+|+R3,R3=R0-|-R3                         || W[P1++P3]=R2.H || R2=[I2++];
+        R1=R1+|+R2,R2=R1-|-R2 (CO)                    || W[P1++P3]=R4.L || R7=[I3++];
+.row1:  R0=R0+|+R1,R1=R0-|-R1                         || W[P1++P5]=R4.H || NOP;
+
+        // Exchange input with output.
+        B1 = B0;
+        B0 = B2;
+.3:     B2 = B1;
+
+        L3=0;
+        (r7:4,p5:3) = [sp++];
+        RTS;
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bfin/idct_bfin.S	Sun Apr 01 22:28:45 2007 +0000
@@ -0,0 +1,297 @@
+/*
+ * idct BlackFin
+ *
+ * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+/*
+   This blackfin DSP code implements an 8x8 inverse type II DCT.
+
+Prototype       : void ff_bfin_idct(DCTELEM *in)
+
+Registers Used  : A0, A1, R0-R7, I0-I3, B0, B2, B3, M0-M3, L0-L3, P0-P5, LC0.
+
+Performance     :
+                    Code Size   : 498 Bytes.
+                    Cycle Count : 417 Cycles
+
+
+-----------------------------------------------------------
+FFMPEG conformance testing results
+-----------------------------------------------------------
+
+dct-test: modified with the following
+            dct_error("BFINidct", 1, ff_bfin_idct, idct, test);
+produces the following output
+
+root:/u/ffmpeg/bhead/libavcodec> ./dct-test -i
+ffmpeg DCT/IDCT test
+
+    8    15    -2    21    24    17     0    10
+    2   -10    -5    -5    -3     7   -14    -3
+    2   -13   -10   -19    18    -6     6    -2
+    9     4    16    -3     9    12    10    15
+   15    -9    -2    10     1    16     0   -15
+  -15     5     7     3    13     0    13    20
+   -6   -15    24     9   -18     1     9   -22
+   -8    25    23     2    -7     0    30    13
+IDCT BFINidct: err_inf=1 err2=0.01002344 syserr=0.00150000 maxout=266 blockSumErr=64
+IDCT BFINidct: 88.3 kdct/s
+
+*/
+
+#include "config_bfin.h"
+
+.section .l1.data.B,"aw",@progbits
+
+.align 4;
+coefs:
+.short 0x5a82;           //  C4
+.short 0x5a82;           //  C4
+.short 0x30FC;           //cos(3pi/8)  C6
+.short 0x7642;           //cos(pi/8)   C2
+.short 0x18F9;           //cos(7pi/16)
+.short 0x7D8A;           //cos(pi/16)
+.short 0x471D;           //cos(5pi/16)
+.short 0x6A6E;           //cos(3pi/16)
+.short 0x18F9;           //cos(7pi/16)
+.short 0x7D8A;           //cos(pi/16)
+
+.section .l1.data.A
+
+vtmp: .space 256
+
+#define TMP0 FP-8
+#define TMP1 FP-12
+#define TMP2 FP-16
+
+
+DEFUN(idct,mL1,
+        (DCTELEM *block)):
+
+/********************** Function Prologue *********************************/
+    link 16;
+    [--SP] = (R7:4, P5:3);   // Push the registers onto the stack.
+    B0 = R0;                 // Pointer to Input matrix
+    R1 = [P3+coefs@GOT17M4]; // Pointer to Coefficients
+    R2 = [P3+vtmp@GOT17M4];  // Pointer to Temporary matrix
+    B3 = R1;
+    B2 = R2;
+    L3 = 20;                // L3 is used for making the coefficient array
+                            // circular.
+                            // MUST BE RESTORED TO ZERO at function exit.
+    M1 = 16 (X);            // All these registers are initialized for
+    M3 = 8(X);              // modifying address offsets.
+
+    I0 = B0;                // I0 points to Input Element (0, 0).
+    I2 = B0;                // I2 points to Input Element (0, 0).
+    I2 += M3 || R0.H = W[I0];
+                            // Element 0 is read into R0.H
+    I1 = I2;                // I1 points to input Element (0, 6).
+    I1 += 4  || R0.L = W[I2++];
+                            // I2 points to input Element (0, 4).
+                            // Element 4 is read into R0.L.
+    P2 = 8 (X);
+    P3 = 32 (X);
+    P4 = -32 (X);
+    P5 = 98 (X);
+    R7 = 0x8000(Z);
+    I3 = B3;                // I3 points to Coefficients
+    P0 = B2;                // P0 points to array Element (0, 0) of temp
+    P1 = B2;
+    R7 = [I3++] || [TMP2]=R7;            // Coefficient C4 is read into R7.H and R7.L.
+    MNOP;
+    NOP;
+
+    /*
+     *   A1 =      Y0 * cos(pi/4)
+     *   A0 =      Y0 * cos(pi/4)
+     *   A1 = A1 + Y4 * cos(pi/4)
+     *   A0 = A0 - Y4 * cos(pi/4)
+     *   load:
+     *     R1=(Y2,Y6)
+     *     R7=(C2,C6)
+     *   res:
+     *     R3=Y0, R2=Y4
+     */
+    A1=R7.H*R0.H,       A0=R7.H*R0.H (IS)       || I0+= 4       || R1.L=W[I1++];
+    R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || R1.H=W[I0--] || R7=[I3++];
+
+    LSETUP (.0, .1) LC0 = P2; // perform 8 1d idcts
+
+    P2 = 112 (X);
+    P1 = P1 + P2;           // P1 points to element (7, 0) of temp buffer.
+    P2 = -94(X);
+
+.0:
+       /*
+        *   A1 =      Y2 * cos(3pi/8)
+        *   A0 =      Y2 * cos(pi/8)
+        *   A1 = A1 - Y6 * cos(pi/8)
+        *   A0 = A0 + Y6 * cos(3pi/8)
+        *      R5 = (Y1,Y7)
+        *      R7 = (C1,C7)
+        *   res:
+        *      R1=Y2, R0=Y6
+        */
+        A1=R7.L*R1.H,       A0=R7.H*R1.H (IS)        || I0+=4        || R5.H=W[I0];
+        R1=(A1-=R7.H*R1.L), R0=(A0+=R7.L*R1.L) (IS)  || R5.L=W[I1--] || R7=[I3++];
+        /*
+        *   Y0 = Y0 + Y6.
+        *   Y4 = Y4 + Y2.
+        *   Y2 = Y4 - Y2.
+        *   Y6 = Y0 - Y6.
+        *     R3 is saved
+        *     R6.l=Y3
+        * note: R3: Y0, R2: Y4, R1: Y2, R0: Y6
+        */
+        R3=R3+R0, R0=R3-R0;
+        R2=R2+R1, R1=R2-R1 || [TMP0]=R3 || R6.L=W[I0--];
+        /*
+         *  Compute the odd portion (1,3,5,7) even is done.
+         *
+         *  Y1 = C7 * Y1 - C1 * Y7 + C3 * Y5 - C5 * Y3.
+         *  Y7 = C1 * Y1 + C7 * Y7 + C5 * Y5 + C3 * Y3.
+         *  Y5 = C5 * Y1 + C3 * Y7 + C7 * Y5 - C1 * Y3.
+         *  Y3 = C3 * Y1 - C5 * Y7 - C1 * Y5 - C7 * Y3.
+         */
+        //  R5=(Y1,Y7)  R6=(Y5,Y3)                                                   // R7=(C1,C7)
+        A1 =R7.L*R5.H,       A0 =R7.H*R5.H (IS)       || [TMP1]=R2 || R6.H=W[I2--];
+        A1-=R7.H*R5.L,       A0+=R7.L*R5.L (IS)       || I0-=4     || R7=[I3++];
+        A1+=R7.H*R6.H,       A0+=R7.L*R6.H (IS)       || I0+=M1;                     // R7=(C3,C5)
+        R3 =(A1-=R7.L*R6.L), R2 =(A0+=R7.H*R6.L) (IS);
+        A1 =R7.L*R5.H,       A0 =R7.H*R5.H (IS)       || R4=[TMP0];
+        A1+=R7.H*R5.L,       A0-=R7.L*R5.L (IS)       || I1+=M1    || R7=[I3++];     // R7=(C1,C7)
+        A1+=R7.L*R6.H,       A0-=R7.H*R6.H (IS);
+        R7 =(A1-=R7.H*R6.L), R6 =(A0-=R7.L*R6.L) (IS) || I2+=M1;
+        // R3=Y1, R2=Y7, R7=Y5, R6=Y3
+
+        /* Transpose write column. */
+        R5.H=R4+R2 (RND12);                                   // Y0=Y0+Y7
+        R5.L=R4-R2 (RND12) || R4 = [TMP1];                    // Y7=Y7-Y0
+        R2.H=R1+R7 (RND12) || W[P0++P3]=R5.H;                 // Y2=Y2+Y5 st Y0
+        R2.L=R1-R7 (RND12) || W[P1++P4]=R5.L || R7=[I3++];    // Y5=Y2-Y5 st Y7
+        R5.H=R0-R3 (RND12) || W[P0++P3]=R2.H || R1.L=W[I1++]; // Y1=Y6-Y1 st Y2
+        R5.L=R0+R3 (RND12) || W[P1++P4]=R2.L || R0.H=W[I0++]; // Y6=Y6+Y1 st Y5
+        R3.H=R4-R6 (RND12) || W[P0++P3]=R5.H || R0.L=W[I2++]; // Y3=Y3-Y4 st Y1
+        R3.L=R4+R6 (RND12) || W[P1++P4]=R5.L || R1.H=W[I0++]; // Y4=Y3+Y4 st Y6
+
+        /* pipeline loop start, + drain Y3, Y4 */
+        A1=R7.H*R0.H,       A0=R7.H*R0.H (IS)       || W[P0++P2]= R3.H || R1.H = W[I0--];
+.1:     R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || W[P1++P5]= R3.L || R7 = [I3++];
+
+
+
+    I0 = B2;                // I0 points to Input Element (0, 0)
+    I2 = B2;                // I2 points to Input Element (0, 0)
+    I2 += M3 || R0.H = W[I0];
+                            // Y0 is read in R0.H
+    I1 = I2;                // I1 points to input Element (0, 6)
+    I1 += 4  || R0.L = W[I2++];
+                            // I2 points to input Element (0, 4)
+                            // Y4 is read in R0.L
+    P2 = 8 (X);
+    I3 = B3;                // I3 points to Coefficients
+    P0 = B0;                // P0 points to array Element (0, 0) for writing
+                            // output
+    P1 = B0;
+    R7 = [I3++];            // R7.H = C4 and R7.L = C4
+    NOP;
+
+    /*
+     *   A1 =      Y0 * cos(pi/4)
+     *   A0 =      Y0 * cos(pi/4)
+     *   A1 = A1 + Y4 * cos(pi/4)
+     *   A0 = A0 - Y4 * cos(pi/4)
+     *   load:
+     *     R1=(Y2,Y6)
+     *     R7=(C2,C6)
+     *   res:
+     *     R3=Y0, R2=Y4
+     */
+    A1=R7.H*R0.H,       A0=R7.H*R0.H (IS)       || I0+=4        || R1.L=W[I1++];
+    R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || R1.H=W[I0--] || R7=[I3++];
+
+    LSETUP (.2, .3) LC0 = P2; // peform 8 1d idcts
+    P2 = 112 (X);
+    P1 = P1 + P2;
+    P2 = -94(X);
+
+.2:
+        /*
+         *   A1 =      Y2 * cos(3pi/8)
+         *   A0 =      Y2 * cos(pi/8)
+         *   A1 = A1 - Y6 * cos(pi/8)
+         *   A0 = A0 + Y6 * cos(3pi/8)
+         *      R5 = (Y1,Y7)
+         *      R7 = (C1,C7)
+         *   res:
+         *      R1=Y2, R0=Y6
+         */
+        A1=R7.L*R1.H,       A0=R7.H*R1.H (IS)        || I0+=4        || R5.H=W[I0];
+        R1=(A1-=R7.H*R1.L), R0=(A0+=R7.L*R1.L) (IS)  || R5.L=W[I1--] || R7=[I3++];
+        /*
+        *   Y0 = Y0 + Y6.
+        *   Y4 = Y4 + Y2.
+        *   Y2 = Y4 - Y2.
+        *   Y6 = Y0 - Y6.
+        *     R3 is saved
+        *     R6.l=Y3
+        * note: R3: Y0, R2: Y4, R1: Y2, R0: Y6
+        */
+        R3=R3+R0, R0=R3-R0;
+        R2=R2+R1, R1=R2-R1 || [TMP0]=R3 || R6.L=W[I0--];
+        /*
+         *  Compute the odd portion (1,3,5,7) even is done.
+         *
+         *  Y1 = C7 * Y1 - C1 * Y7 + C3 * Y5 - C5 * Y3.
+         *  Y7 = C1 * Y1 + C7 * Y7 + C5 * Y5 + C3 * Y3.
+         *  Y5 = C5 * Y1 + C3 * Y7 + C7 * Y5 - C1 * Y3.
+         *  Y3 = C3 * Y1 - C5 * Y7 - C1 * Y5 - C7 * Y3.
+         */
+        //  R5=(Y1,Y7)  R6=(Y5,Y3)                                                   // R7=(C1,C7)
+        A1 =R7.L*R5.H,       A0 =R7.H*R5.H (IS)       || [TMP1]=R2 || R6.H=W[I2--];
+        A1-=R7.H*R5.L,       A0+=R7.L*R5.L (IS)       || I0-=4     || R7=[I3++];
+        A1+=R7.H*R6.H,       A0+=R7.L*R6.H (IS)       || I0+=M1;                     // R7=(C3,C5)
+        R3 =(A1-=R7.L*R6.L), R2 =(A0+=R7.H*R6.L) (IS);
+        A1 =R7.L*R5.H,       A0 =R7.H*R5.H (IS)       || R4=[TMP0];
+        A1+=R7.H*R5.L,       A0-=R7.L*R5.L (IS)       || I1+=M1    || R7=[I3++];     // R7=(C1,C7)
+        A1+=R7.L*R6.H,       A0-=R7.H*R6.H (IS);
+        R7 =(A1-=R7.H*R6.L), R6 =(A0-=R7.L*R6.L) (IS) || I2+=M1;
+        // R3=Y1, R2=Y7, R7=Y5, R6=Y3
+
+        /* Transpose write column. */
+        R5.H=R4+R2 (RND20);                                   // Y0=Y0+Y7
+        R5.L=R4-R2 (RND20) || R4 = [TMP1];                    // Y7=Y7-Y0
+        R2.H=R1+R7 (RND20) || W[P0++P3]=R5.H;                 // Y2=Y2+Y5 st Y0
+        R2.L=R1-R7 (RND20) || W[P1++P4]=R5.L || R7=[I3++];    // Y5=Y2-Y5 st Y7
+        R5.H=R0-R3 (RND20) || W[P0++P3]=R2.H || R1.L=W[I1++]; // Y1=Y6-Y1 st Y2
+        R5.L=R0+R3 (RND20) || W[P1++P4]=R2.L || R0.H=W[I0++]; // Y6=Y6+Y1 st Y5
+        R3.H=R4-R6 (RND20) || W[P0++P3]=R5.H || R0.L=W[I2++]; // Y3=Y3-Y4 st Y1
+        R3.L=R4+R6 (RND20) || W[P1++P4]=R5.L || R1.H=W[I0++]; // Y4=Y3+Y4 st Y6
+
+        /* pipeline loop start, + drain Y3, Y4 */
+        A1=R7.H*R0.H,       A0=R7.H*R0.H (IS)       || W[P0++P2]= R3.H || R1.H = W[I0--];
+.3:     R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || W[P1++P5]= R3.L || R7 = [I3++];
+
+    L3 = 0;
+    (R7:4,P5:3)=[SP++];
+    unlink;
+    RTS;
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bfin/pixels_bfin.S	Sun Apr 01 22:28:45 2007 +0000
@@ -0,0 +1,723 @@
+/*
+ * Blackfin Pixel Operations
+ * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "config_bfin.h"
+
+DEFUN(put_pixels_clamped,mL1,
+        (DCTELEM *block, uint8_t *dest, int line_size)):   // clamp each 16-bit value of block to 0..255 and store 8 rows of 8 bytes to dest
+    [--SP] = (R7:4);
+    R4 = 0;          // lower clamp bound for both 16-bit halves
+    R5.l = 0x00ff;   // upper clamp bound (255), low half
+    R5.h = 0x00ff;   // upper clamp bound (255), high half
+    I0 = R0;         // block
+    I1 = R1;         // dest
+    R2 += -4;        // line_size - 4
+    M1 = R2;         // dest step applied with the second 4-byte store of a row
+    P0 = 8;          // loop count: 8 rows
+    R0 = [I0++];     // preload the first two words (four 16-bit values)
+    R1 = [I0++];
+    R2 = MAX(R0, R4) (V);
+    LSETUP (ppc$0,ppc$1) LC0=P0;
+ppc$0: R2 = MIN(R2, R5) (V);
+       R3 = MAX(R1, R4) (V);
+       R3 = MIN(R3, R5) (V)      || R0 = [I0++];
+       R6 = BYTEPACK (R2,R3)     || R1 = [I0++];
+       R2 = MAX(R0, R4) (V)      || [I1++] = R6;     // store bytes 0..3 of the row
+       R2 = MIN(R2, R5) (V);
+       R3 = MAX(R1, R4) (V);
+       R3 = MIN(R3, R5) (V)      || R0 = [I0++];
+       R6 = BYTEPACK (R2,R3)     || R1 = [I0++];
+ppc$1: R2 = Max(R0, R4) (V)      || [I1++M1] = R6;   // store bytes 4..7, advance dest to the next row
+    // NOTE: loads run one step ahead of the stores, so the loop reads two 32-bit words past the 32 words it consumes.
+    (R7:4) = [SP++];
+    RTS;
+
+DEFUN(add_pixels_clamped,mL1,
+        (DCTELEM *block, uint8_t *dest, int line_size)):   // add the 16-bit block values to the dest pixels with BYTEOP3P and write them back, 8 rows of 8 bytes
+    [-- SP] = (R7:4);
+    R4 = 0;          // not read again in this routine
+    I0 = 0;          // NOTE(review): presumably selects zero byte alignment for BYTEOP3P -- confirm against the ISA reference
+    R2 += -4;        // line_size - 4
+    M0 = R2;         // row step used with the second word of a row (reads via I1, writes via I2)
+    I1 = R1;         // dest (read pointer)
+    I3 = R0;         // block
+    I2 = R1;         // dest (write pointer)
+    P0 = 8;          // loop count: 8 rows
+    M3 = 2;
+    R0 = [I3++]  || R2 = [I1];
+    R2 = R2 << 8                      || R0.H = W[I3--]  || R3 = [I1++];
+    R3 = R3 >> 8                      || R1.L = W[I3]    || I3 += 4;
+    R6 = BYTEOP3P(R1:0, R3:2) (LO)    || R1.H = W[I3++]  || R2 = [I1];
+    // NOTE(review): the << 8 / >> 8 shifts appear to position the dest bytes for the LO and HI BYTEOP3P results -- confirm.
+    LSETUP(apc$2,apc$3) LC1 = P0;
+apc$2: R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++]     || R3 = [I1++M0];
+       R2 = R2 << 8                      || R0.H = W[I3--];
+       R3 = R3 >> 8                      || R1.L = W[I3]    || I3 += 4;
+       R6 = R6 + R7 (S)                  || R1.H = W[I3];
+       R6 = BYTEOP3P(R1:0, R3:2) (LO)    || I3+=M3          || [I2++]=R6;        // store bytes 0..3 of the row
+       R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++]     || R2 = [I1];
+       R2 = R2 << 8                      || R0.H = W[I3--]  || R3 = [I1++];
+       R3 = R3 >> 8                      || R1.L = W[I3]    || I3 += 4;
+       R6 = R6 + R7 (S)                  || R1.H = W[I3++];
+apc$3: R6 = BYTEOP3P(R1:0, R3:2) (LO)    || [I2++M0] = R6   || R2 = [I1];        // store bytes 4..7, advance the write pointer to the next row
+
+    (R7:4) = [SP++];
+    RTS;
+
+
+/*
+  motion compensation
+  primitives
+
+     * Halfpel motion compensation with rounding (a+b+1)>>1.
+     * This is an array[4][4] of motion compensation functions for 4
+     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
+     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
+     * @param block destination where the result is stored
+     * @param pixels source
+     * @param line_size number of bytes in a horizontal line of block
+     * @param h height
+
+*/
+
+DEFUN(put_pixels8uc,mL1,
+        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
+                 int dest_size, int line_size, int h)):   // block = byte-wise BYTEOP1P average of s0 and s1, 8 bytes per row, h rows
+        i3=r0;        // dest
+        i0=r1;        // src0
+        i1=r2;        // src1
+        r0=[sp+12];   // dest_size (stack arguments are read before the register push below)
+        r2=[sp+16];   // line_size
+        p0=[sp+20];   // h
+        [--sp] = (r7:6);
+        r0+=-4;       // dest_size - 4
+        m3=r0;        // dest step applied with the second store of a row
+        r2+=-8;       // line_size - 8
+        m0=r2;        // source step applied with the third load of a row
+        LSETUP(pp8$0,pp8$1) LC0=P0;
+        DISALGNEXCPT                || R0 = [I0++]  || R2  =[I1++];   // preload the first word of each source
+
+pp8$0:  DISALGNEXCPT                || R1 = [I0++]  || R3  =[I1++];
+        R6 = BYTEOP1P(R1:0,R3:2)    || R0 = [I0++M0]|| R2  =[I1++M0];
+        R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++]  || [I3++] = R6 ;  // store bytes 0..3, preload the next row
+pp8$1:  DISALGNEXCPT                || R2 = [I1++]  || [I3++M3] = R7; // store bytes 4..7, advance dest to the next row
+
+        (r7:6) = [sp++];
+        RTS;
+
+DEFUN(put_pixels16uc,mL1,
+        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
+                 int dest_size, int line_size, int h)):   // block = byte-wise BYTEOP1P average of s0 and s1, 16 bytes per row, h rows
+        link 0;
+        [--sp] = (r7:6);
+        i3=r0;        // dest
+        i0=r1;        // src0
+        i1=r2;        // src1
+        r0=[fp+20];   // dest_size
+        r2=[fp+24];   // line_size
+        p0=[fp+28];   // h
+
+
+        r0+=-12;      // dest_size - 12
+        m3=r0;        // dest step applied with the fourth store of a row
+        r2+=-16;      // line_size - 16
+        m0=r2;        // source step applied with the fifth load of a row
+
+        LSETUP(pp16$0,pp16$1) LC0=P0;
+         DISALGNEXCPT                || R0 = [I0++]   || R2  =[I1++];   // preload the first word of each source
+
+pp16$0:  DISALGNEXCPT                || R1 = [I0++]   || R3  =[I1++];
+         R6 = BYTEOP1P(R1:0,R3:2)    || R0 = [I0++]   || R2  =[I1++];
+         R7 = BYTEOP1P(R1:0,R3:2)(R) || R1 = [I0++]   || R3  =[I1++];
+         [I3++] = R6;
+         R6 = BYTEOP1P(R1:0,R3:2)    || R0 = [I0++M0] || R2  =[I1++M0];
+         R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++]   || [I3++] = R7 ;
+         [I3++] = R6;
+pp16$1:  DISALGNEXCPT                || R2 = [I1++]   || [I3++M3] = R7; // last word of the row, advance dest to the next row
+
+        (r7:6) = [sp++];
+        unlink;
+        RTS;
+
+
+
+
+
+
+DEFUN(put_pixels8uc_nornd,mL1,
+        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
+                 int line_size, int h)):   // like put_pixels8uc but BYTEOP1P uses the (T) option and dest shares line_size with the sources
+        i3=r0;        // dest
+        i0=r1;        // src0
+        i1=r2;        // src1
+        r2=[sp+12];   // line_size (stack arguments are read before the register push below)
+        p0=[sp+16];   // h
+        [--sp] = (r7:6);
+        r2+=-4;       // line_size - 4
+        m3=r2;        // dest step applied with the second store of a row
+        r2+=-4;       // line_size - 8
+        m0=r2;        // source step applied with the third load of a row
+        LSETUP(pp8$2,pp8$3) LC0=P0;
+        DISALGNEXCPT                || R0 = [I0++]  || R2  =[I1++];   // preload the first word of each source
+
+pp8$2:  DISALGNEXCPT                || R1 = [I0++]  || R3  =[I1++];
+        R6 = BYTEOP1P(R1:0,R3:2)(T)  || R0 = [I0++M0]|| R2  =[I1++M0];
+        R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++]  || [I3++] = R6 ;
+pp8$3:  DISALGNEXCPT                || R2 = [I1++]  || [I3++M3] = R7; // store bytes 4..7, advance dest to the next row
+
+        (r7:6) = [sp++];
+        RTS;
+
+DEFUN(put_pixels16uc_nornd,mL1,
+        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
+                 int line_size, int h)):   // like put_pixels16uc but BYTEOP1P uses the (T) option and dest shares line_size with the sources
+        i3=r0;        // dest
+        i0=r1;        // src0
+        i1=r2;        // src1
+        r2=[sp+12];   // line_size (stack arguments are read before the register push below)
+        p0=[sp+16];   // h
+
+        [--sp] = (r7:6);
+        r2+=-12;      // line_size - 12
+        m3=r2;        // dest step applied with the fourth store of a row
+        r2+=-4;       // line_size - 16
+        m0=r2;        // source step applied with the fifth load of a row
+
+        LSETUP(pp16$2,pp16$3) LC0=P0;
+        DISALGNEXCPT                || R0 = [I0++]   || R2  =[I1++];   // preload the first word of each source
+
+pp16$2:
+        DISALGNEXCPT                || R1 = [I0++]   || R3  =[I1++];
+        R6 = BYTEOP1P(R1:0,R3:2)(T)    || R0 = [I0++]   || R2  =[I1++];
+        R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R1 = [I0++]   || R3  =[I1++];
+        [I3++] = R6;
+
+        R6 = BYTEOP1P(R1:0,R3:2)(T)    || R0 = [I0++M0] || R2  =[I1++M0];
+        R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++]   || [I3++] = R7 ;
+        [I3++] = R6;
+pp16$3: DISALGNEXCPT                || R2 = [I1++]   || [I3++M3] = R7; // last word of the row, advance dest to the next row
+
+        (r7:6) = [sp++];
+
+        RTS;
+
+DEFUN(z_put_pixels16_xy2,mL1,
+        (uint8_t *block, const uint8_t *s0,
+                 int dest_size, int line_size, int h)):   // two-pass BYTEOP2P average of s0 and s0 + line_size, 16 bytes per row, h rows
+        link 0;
+        [--sp] = (r7:4);
+        i3=r0;        // dest
+        i0=r1;        // src0--> pixels
+        i1=r1;        // src1--> pixels + line_size
+        r2+=-12;      // r2 enters as dest_size
+        m2=r2;        // m2=dest_size-12
+        r2=[fp+20];
+        m3=r2;        // line_size
+        p0=[fp+24];   // h
+        r2+=-16;
+        i1+=m3;       /* src1 + line_size */
+        m0=r2;        /* line_size - 16 */
+
+        B0 = I0;      // keep the start pointers for the second pass
+        B1 = I1;
+        B3 = I3;
+
+        DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];
+        // pass 1: the RNDL results are stored directly to dest
+        LSETUP(LS$16E,LE$16E) LC0=P0;
+LS$16E: DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
+        R4 = BYTEOP2P (R3:2,R1:0) (RNDL)   || R0 = [I0++] || R2  =[I1++];
+        R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R1 = [I0++] || [I3++] = R4 ;
+        DISALGNEXCPT                       || R3 = [I1++] || [I3++] = R5;
+        R4 = BYTEOP2P (R3:2,R1:0) (RNDL)   || R0 = [I0++M0]|| R2  = [I1++M0];
+        R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++] || [I3++] = R4 ;
+LE$16E: DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;
+
+        M1 = 1;
+        I3 = B3;      // restart at the saved dest and source pointers
+        I1 = B1;
+        I0 = B0;
+
+        I0 += M1;     // second pass starts one byte further into both source rows
+        I1 += M1;
+        // pass 2: the RNDH results are added (+|+) to the words stored by pass 1
+        DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];
+        LSETUP(LS$16O,LE$16O) LC0=P0;
+LS$16O: DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
+        R4 = BYTEOP2P (R3:2,R1:0) (RNDH)   || R0 = [I0++] || R2  =[I1++];
+        R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R1 = [I0++] || R6  =[I3++];
+        R4 = R4 +|+ R6                       || R7 = [I3--];
+        R5 = R5 +|+ R7                       || [I3++] = R4;
+        DISALGNEXCPT                       || R3  =[I1++] || [I3++] = R5;
+        R4 = BYTEOP2P (R3:2,R1:0) (RNDH)   || R0 = [I0++M0]|| R2  = [I1++M0];
+        R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++] || R6 = [I3++];
+        R4 = R4 +|+ R6                       || R7 = [I3--];
+        R5 = R5 +|+ R7                       || [I3++] = R4;
+LE$16O: DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;
+
+        (r7:4) = [sp++];
+        unlink;
+        rts;
+
+DEFUN(put_pixels16_xy2_nornd,mL1,
+        (uint8_t *block, const uint8_t *s0,
+                 int line_size, int h)):   // two-pass BYTEOP2P (TL/TH) average of s0 and s0 + line_size, 16 bytes per row, h rows
+        link 0;
+        [--sp] = (r7:4);
+        i3=r0;        // dest
+        i0=r1;        // src0--> pixels
+        i1=r1;        // src1--> pixels + line_size
+        m3=r2;        // line_size
+        r2+=-12;
+        m2=r2;        // line_size - 12: dest step applied with the fourth store of a row
+        r2+=-4;
+        i1+=m3;       /* src1 + line_size */
+        m0=r2;        /* line_size - 16 */
+        p0=[fp+20];   // h
+
+        B0=I0;        // keep the start pointers for the second pass
+        B1=I1;
+        B3=I3;
+
+        DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];
+        // pass 1: the TL results are stored directly to dest
+        LSETUP(LS$16ET,LE$16ET) LC0=P0;
+LS$16ET:DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
+        R4 = BYTEOP2P (R3:2,R1:0) (TL)     || R0 = [I0++] || R2  =[I1++];
+        R5 = BYTEOP2P (R3:2,R1:0) (TL,R)   || R1 = [I0++] || [I3++] = R4 ;
+        DISALGNEXCPT                       || R3 = [I1++] || [I3++] = R5;
+        R4 = BYTEOP2P (R3:2,R1:0) (TL)     || R0 = [I0++M0]|| R2  = [I1++M0];
+        R5 = BYTEOP2P (R3:2,R1:0) (TL,R)   || R0 = [I0++] || [I3++] = R4 ;
+LE$16ET:DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;
+
+        M1 = 1;
+        I3=B3;        // restart at the saved dest and source pointers
+        I1=B1;
+        I0=B0;
+
+        I0 += M1;     // second pass starts one byte further into both source rows
+        I1 += M1;
+        // pass 2: the TH results are added (+|+) to the words stored by pass 1
+        DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];
+        LSETUP(LS$16OT,LE$16OT) LC0=P0;
+LS$16OT:DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
+        R4 = BYTEOP2P (R3:2,R1:0) (TH)     || R0 = [I0++] || R2  =[I1++];
+        R5 = BYTEOP2P (R3:2,R1:0) (TH,R)   || R1 = [I0++] || R6  =[I3++];
+        R4 = R4 +|+ R6                                    || R7 = [I3--];
+        R5 = R5 +|+ R7                                    || [I3++] = R4;
+        DISALGNEXCPT                       || R3  =[I1++] || [I3++] = R5;
+        R4 = BYTEOP2P (R3:2,R1:0) (TH)     || R0 = [I0++M0]|| R2  = [I1++M0];
+        R5 = BYTEOP2P (R3:2,R1:0) (TH,R)   || R0 = [I0++] || R6 = [I3++];
+        R4 = R4 +|+ R6                                    || R7 = [I3--];
+        R5 = R5 +|+ R7                                    || [I3++] = R4;
+LE$16OT:DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;
+
+        (r7:4) = [sp++];
+        unlink;
+        rts;
+
+DEFUN(z_put_pixels8_xy2,mL1,
+        (uint8_t *block, const uint8_t *s0,
+                 int dest_size, int line_size, int h)):   // two-pass BYTEOP2P average of s0 and s0 + line_size, 8 bytes per row, h rows
+        link 0;
+        [--sp] = (r7:4);
+        i3=r0;        // dest
+        i0=r1;        // src0--> pixels
+        i1=r1;        // src1--> pixels + line_size
+        r2+=-4;       // r2 enters as dest_size
+        m2=r2;        // m2=dest_size-4
+        r2=[fp+20];
+        m3=r2;        // line_size
+        p0=[fp+24];   // h
+        r2+=-8;
+        i1+=m3;       /* src1 + line_size */
+        m0=r2;        /* line_size - 8 */
+
+        b0 = I0;      // keep the start pointers for the second pass
+        b1 = I1;
+        b3 = I3;
+        // pass 1: the RNDL results are stored directly to dest
+        LSETUP(LS$8E,LE$8E) LC0=P0;
+        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
+LS$8E:  DISALGNEXCPT                       || R1 = [I0++]   || R3  =[I1++];
+        R4 = BYTEOP2P (R3:2,R1:0) (RNDL)   || R0 = [I0++M0] || R2  =[I1++M0];
+        R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++]   || [I3++] = R4 ;
+LE$8E:  DISALGNEXCPT                       || R2 = [I1++]   || [I3++M2] = R5;
+
+        M1 = 1;
+        I3 = b3;      // restart at the saved dest and source pointers
+        I1 = b1;
+        I0 = b0;
+
+        I0 += M1;     // second pass starts one byte further into both source rows
+        I1 += M1;
+        // pass 2: the RNDH results are added (+|+) to the words stored by pass 1
+        LSETUP(LS$8O,LE$8O) LC0=P0;
+        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
+LS$8O:  DISALGNEXCPT                       || R1 = [I0++]   || R3  =[I1++];
+        R4 = BYTEOP2P (R3:2,R1:0) (RNDH)   || R0 = [I0++M0] || R2  =[I1++M0];
+        R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++]   || R6  =[I3++];
+        R4 = R4 +|+ R6                                      || R7 = [I3--];
+        R5 = R5 +|+ R7                                      || [I3++] = R4;
+LE$8O:  DISALGNEXCPT                       || R2  =[I1++]   || [I3++M2] = R5;
+
+        (r7:4) = [sp++];
+        unlink;
+        rts;
+
+DEFUN(put_pixels8_xy2_nornd,mL1,
+        (uint8_t *block, const uint8_t *s0, int line_size, int h)):   // two-pass BYTEOP2P (TL/TH) average of s0 and s0 + line_size, 8 bytes per row, h rows
+        link 0;
+        [--sp] = (r7:4);
+        i3=r0;        // dest
+        i0=r1;        // src0--> pixels
+        i1=r1;        // src1--> pixels + line_size
+        m3=r2;        // line_size
+        r2+=-4;
+        m2=r2;        // line_size - 4: dest step applied with the second store of a row
+        r2+=-4;
+        i1+=m3;       /* src1 + line_size */
+        m0=r2;        /* line_size - 8 */
+        p0=[fp+20];   // h
+
+
+        b0 = I0;      // keep the start pointers for the second pass
+        b1 = I1;
+        b3 = I3;
+        // pass 1: the TL results are stored directly to dest
+        LSETUP(LS$8ET,LE$8ET) LC0=P0;
+        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
+
+LS$8ET: DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
+        R4 = BYTEOP2P (R3:2,R1:0) (TL)     || R0 = [I0++M0] || R2 = [I1++M0];
+        R5 = BYTEOP2P (R3:2,R1:0) (TL,R)   || R0 = [I0++]   || [I3++] = R4 ;
+LE$8ET: DISALGNEXCPT                       || R2 = [I1++]   || [I3++M2] = R5;
+
+        M1 = 1;
+        I3 = b3;      // restart at the saved dest and source pointers
+        I1 = b1;
+        I0 = b0;
+
+        I0 += M1;     // second pass starts one byte further into both source rows
+        I1 += M1;
+        // pass 2: the TH results are added (+|+) to the words stored by pass 1
+        LSETUP(LS$8OT,LE$8OT) LC0=P0;
+        DISALGNEXCPT                       || R0 = [I0++]   || R2 = [I1++];
+
+LS$8OT: DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
+        R4 = BYTEOP2P (R3:2,R1:0) (TH)     || R0 = [I0++M0] || R2 = [I1++M0];
+        R5 = BYTEOP2P (R3:2,R1:0) (TH,R)   || R0 = [I0++]   || R6 = [I3++];
+        R4 = R4 +|+ R6                                      || R7 = [I3--];
+        R5 = R5 +|+ R7                                      || [I3++] = R4;
+LE$8OT: DISALGNEXCPT                       || R2  =[I1++]   || [I3++M2] = R5;
+
+        (r7:4) = [sp++];
+        unlink;
+        rts;
+
+DEFUN(diff_pixels,mL1,
+       (DCTELEM *block, uint8_t *s1, uint8_t *s2, int stride)):   // store the BYTEOP16M byte differences of s1 and s2 as 16-bit values, 8 rows of 8 pixels
+        link 0;
+        [--sp] = (r7:4);
+        p0=8;         // loop count: 8 rows
+        i3=r0;        // block
+        i0=r1;        // s1
+        i1=r2;        // s2
+        r2=[fp+20];   // stride
+        r2+=-8;       // stride - 8
+        m0=r2;        // source step applied with the third load of a row
+
+
+        LSETUP(.LS0,.LE0) LC0=P0;
+        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];   // preload the first word of each source
+
+.LS0:   DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
+        (R5,R4) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++M0] || R2 = [I1++M0];
+        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++]   || [I3++] = R4;
+        DISALGNEXCPT                       || R2 = [I1++]   || [I3++] = R5;
+        [i3++]=r6;
+.LE0:  [i3++]=r7;     // four words (eight 16-bit values) written per row, block is stored contiguously
+
+        (r7:4) = [sp++];
+        unlink;
+        rts;
+
+/*
+    for (i = 0; i < 16; i++) {
+        for (j = 0; j < 16; j++) {
+          sum += pix[j];
+        }
+        pix += line_size;
+    }
+*/
+DEFUN(pix_sum,mL1,
+        (uint8_t *p, int stride)):   // sum of a 16x16 block of bytes, two rows per loop iteration; the result is returned in r0
+        link 0;
+        [--sp] = (r7:4);
+        p0=8;         // loop count: 8 iterations of two rows each
+        i0=r0;        // p (even rows)
+        i1=r0;
+        m1=r1;        // stride
+        r1=r1+r1;
+        r1+=-16;       // 2*stride - 16
+        m0=r1;        // step to the row two lines down, applied with the fifth load of a row
+        i1+=m1;       // i1 = p + stride (odd rows)
+
+        r6=0;         // two 16-bit partial sums
+
+        LSETUP(LS$PS,LE$PS) LC0=P0;
+        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
+
+LS$PS:  DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
+        (R5,R4) = BYTEOP16P (R3:2,R1:0)    || R0 = [I0++]   || R2 = [I1++];
+        r6=r6+|+r5;
+        r6=r6+|+r4;
+        (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R1 = [I0++]   || R3 = [I1++];
+        r6=r6+|+r5;
+        r6=r6+|+r4;
+        (R5,R4) = BYTEOP16P (R3:2,R1:0)    || R0 = [I0++m0] || R2 = [I1++m0];
+        r6=r6+|+r5;
+        r6=r6+|+r4;
+        (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R0 = [I0++]   || R2 = [I1++];
+        r6=r6+|+r5;
+LE$PS:  r6=r6+|+r4;
+        r0.l=r6.l+r6.h;   // combine the two partial sums into a 16-bit result
+        r0.h=0;
+
+        (r7:4) = [sp++];
+        unlink;
+        rts;
+
+
+DEFUN(get_pixels,mL1,
+        (DCTELEM *restrict block, const uint8_t *pixels, int line_size)):   // unpack 8 rows of 8 bytes from pixels into 16-bit values stored contiguously in block
+        [--sp] = (r7:4);
+        i3=r0;        // dest
+        i0=r1;        // src0
+        p0=8;         // loop count: 8 rows
+        r2+=-8;       // line_size - 8
+        m0=r2;        // source step applied with the third load of a row
+        LSETUP(gp8$0,gp8$1) LC0=P0;
+
+        DISALGNEXCPT                   || R0 = [I0++];   // preload the first two words of row 0
+        DISALGNEXCPT                   || R1 = [I0++];
+
+gp8$0:  (R7,R6) = byteunpack R1:0      || R0 = [I0++M0];
+        (R5,R4) = byteunpack R1:0 (R)  || R0 = [I0++]    || [I3++]=R6;
+        DISALGNEXCPT                   || R1 = [I0++]    || [I3++]=R7;
+        [I3++]=R4;
+gp8$1:  [I3++]=R5;    // statement terminator added to match every other statement in this file
+
+
+        (r7:4) = [sp++];
+        RTS;
+
+
+/* sad = sad16x16 (ubyte *mb, ubyte *refwin, srcwidth, refwinwidth, h) */
+/* 91 cycles */
+DEFUN(z_sad16x16,mL1,
+        (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)):   // SAA sum of absolute differences of blk1 and blk2, 16 bytes per row, h rows; the result is returned in R0
+        link 0;
+        I0 = R0;      // blk1
+        I1 = R1;      // blk2
+
+        A1 = A0 = 0;
+        R0 = [sp+20]; // line_size (4th argument, addressed relative to SP after link 0)
+        P2 = [sp+24]; // h
+        R3 = 16;
+        R0 = R0 - R3; // line_size - 16
+        R3 = R2 - R3; // dsz - 16
+        M1 = R0;      // row step for blk2 (I1)
+        M0 = R3;      // row step for blk1 (I0)
+
+        DISALGNEXCPT         || R0 = [I0++]    || R2 = [I1++];
+        LSETUP (s$16, e$16) LC0=P2;
+s$16:   DISALGNEXCPT         || R1 = [I0++]    || R3 = [I1++];
+        SAA (R1:0,R3:2)      || R0 = [I0++]    || R2 = [I1++];
+        SAA (R1:0,R3:2) (R)  || R1 = [I0++]    || R3 = [I1++];
+        SAA (R1:0,R3:2)      || R0 = [I0++M0]  || R2 = [I1++M1];
+e$16:   SAA (R1:0,R3:2) (R)  || R0 = [I0++]    || R2 = [I1++];
+
+        R3=A1.L+A1.H,  R2=A0.L+A0.H ;   // fold the four 16-bit accumulator halves
+        R0 = R2 + R3 ;
+        unlink;
+        RTS;
+
+/* sad = sad8x8 (ubyte *mb, ubyte *refwin, int srcwidth, int refwinwidth, int h) */
+/* 36 cycles */
+DEFUN(z_sad8x8,mL1,
+        (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)):   // SAA sum of absolute differences of blk1 and blk2, 8 bytes per row, h rows; the result is returned in R0
+        I0 = R0;      // blk1
+        I1 = R1;      // blk2
+
+        A1 = A0 = 0;
+        r0 = [sp+12]; // line_size (4th argument)
+        P2 = [sp+16]; // h
+        R3 = 8;
+        R0 = R0 - R3; // line_size - 8
+        R3 = R2 - R3; // dsz - 8
+        M0 = R3;      // row step for blk1 (I0)
+        M1 = R0;      // row step for blk2 (I1)
+
+        LSETUP (s$8, e$8) LC0=P2;
+        DISALGNEXCPT         || R0 = [I0++]   || R2 = [I1++];   // preload the first two words of each block
+        DISALGNEXCPT         || R1 = [I0++]   || R3 = [I1++];
+s$8:    SAA (R1:0,R3:2)      || R0 = [I0++M0] || R2 = [I1++M1];
+        SAA (R1:0,R3:2) (R)  || R0 = [I0++]   || R2 = [I1++];
+e$8:    DISALGNEXCPT         || R1 = [I0++]   || R3 = [I1++];
+
+        R3=A1.L+A1.H,  R2=A0.L+A0.H ;   // fold the four 16-bit accumulator halves
+        R0 = R2 + R3 ;
+        RTS;
+
+DEFUN(pix_norm1,mL1,
+        (uint8_t * pix, int line_size)):   // sum of the squared byte values of a 16x16 block; the result is returned in R0
+        [--SP]=(R7:4,P5:3);
+
+        // Fetch the input arguments.
+        P1 = R0;  // pix
+        P0 = R1;  // line_size
+        P5 = 16;  // loop ctr.
+        P0 -= P5;
+        M0 = P0;  // M0 = line_size-16;
+        // Now for the real work.
+        A1 = A0 = 0;
+        lsetup(_pix_norm1_blkfn_loopStart, _pix_norm1_blkfn_loopEnd) LC1 = P5;
+        I0 = P1;
+        DISALGNEXCPT || r0 = [i0++];
+
+_pix_norm1_blkfn_loopStart:
+        // each iteration unpacks and square-accumulates the 16 bytes pix[0..15] of one row
+        DISALGNEXCPT || r1 = [i0++];
+
+        (r5, r4) = byteunpack r1:0 || r0 = [i0++];
+        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
+        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
+        (r5, r4) = byteunpack r1:0(r) || r1 = [i0++];
+        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
+        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
+        (r5, r4) = byteunpack r1:0 || r0 = [i0++M0];
+        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
+        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
+        (r5, r4) = byteunpack r1:0(r) || r0 = [i0++];
+        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
+_pix_norm1_blkfn_loopEnd:
+        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
+
+
+// Clean up at the end:
+        R2 = A0, R3 = A1;
+        R0 = R2 + R3 (S);   // saturating add of the two accumulators
+
+        (R7:4,P5:3)=[SP++];
+
+        RTS;
+
+DEFUN(sse4,mL1,
+        (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):   // sum of squared BYTEOP16M differences of pix1 and pix2, 4 bytes per row, h rows; v is not used
+        link 0;
+        [--sp] = (r7:6);
+        p0=[fp+24];   // h
+        i0=r1;        // pix1
+        i1=r2;        // pix2
+        r2=[fp+20];   // line_size
+        r2+=-4;       // line_size - 4
+        m0=r2;        // row step applied with the second load of a row
+
+        a0=a1=0;
+        LSETUP(.S40,.E40) LC0=P0;
+        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
+        // NOTE(review): R0/R2 are loaded only above, before the loop; the loop reloads just R1/R3 -- verify that rows after the first are differenced as intended.
+.S40:   DISALGNEXCPT                       || R1 = [I0++M0] || R3 = [I1++M0];
+        (R7,R6) = BYTEOP16M (R1:0,R3:2);
+        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
+.E40:   a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
+        a0 += a1;     // combine the two accumulators
+        r0 = a0;      // return value
+
+        (r7:6) = [sp++];
+        unlink;
+        rts;
+
+DEFUN(sse8,mL1,
+        (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):   // sum of squared BYTEOP16M differences of pix1 and pix2, 8 bytes per row, h rows; v is not used
+        link 0;
+        [--sp] = (r7:6);
+        p0=[fp+24];   // h
+        i0=r1;        // pix1
+        i1=r2;        // pix2
+        r2=[fp+20];   // line_size
+        r2+=-8;       // line_size - 8
+        m0=r2;        // row step applied with the third load of a row
+
+        a0=a1=0;
+        LSETUP(.S80,.E80) LC0=P0;
+        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];   // preload the first word of each source
+
+.S80:   DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
+        (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++M0] || R2 = [I1++M0];
+        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
+        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
+        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++]   || R2 = [I1++];
+        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
+.E80:   a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
+        a0 += a1;     // combine the two accumulators
+        r0 = a0;      // return value
+
+        (r7:6) = [sp++];
+        unlink;
+        rts;
+
+DEFUN(sse16,mL1,
+        (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):   // sum of squared BYTEOP16M differences of pix1 and pix2, 16 bytes per row, h rows; v is not used
+        link 0;
+        [--sp] = (r7:6);
+        p0=[fp+24];   // h
+        i0=r1;        // pix1
+        i1=r2;        // pix2
+        r2=[fp+20];   // line_size
+        r2+=-16;      // line_size - 16
+        m0=r2;        // row step applied with the fifth load of a row
+
+        a0=a1=0;
+        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];   // preload the first word of each source
+        LSETUP(.S160,.E160) LC0=P0;
+
+.S160:  DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
+        (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++]   || R2 = [I1++];
+        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
+        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
+        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R1 = [I0++]   || R3 = [I1++];
+        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
+        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
+        (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++M0] || R2 = [I1++M0];
+        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
+        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
+        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++]   || R2 = [I1++];
+        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
+.E160:  a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
+        a0 += a1;     // combine the two accumulators
+        r0 = a0;      // return value
+
+        (r7:6) = [sp++];
+        unlink;
+        rts;
+
+