Blackfin arch: Replace C version of 64 bit multiply with hand optimized assembly
authorBernd Schmidt <bernds_cb1@t-online.de>
Wed, 7 Jan 2009 15:14:39 +0000 (23:14 +0800)
committerBryan Wu <cooloney@kernel.org>
Wed, 7 Jan 2009 15:14:39 +0000 (23:14 +0800)
Signed-off-by: Bernd Schmidt <bernds_cb1@t-online.de>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
arch/blackfin/lib/muldi3.S [new file with mode: 0644]
arch/blackfin/lib/muldi3.c [deleted file]

diff --git a/arch/blackfin/lib/muldi3.S b/arch/blackfin/lib/muldi3.S
new file mode 100644 (file)
index 0000000..abde120
--- /dev/null
@@ -0,0 +1,68 @@
+.align 2       /* NOTE(review): issued before the section switch below, so it pads the section current here - confirm this is intended when CONFIG_ARITHMETIC_OPS_L1 selects .l1.text */
+.global ___muldi3;
+.type ___muldi3, STT_FUNC;
+
+#ifdef CONFIG_ARITHMETIC_OPS_L1
+.section .l1.text
+#else
+.text
+#endif
+
+/*
+          R1:R0 * R3:R2
+        = R1.h:R1.l:R0.h:R0.l * R3.h:R3.l:R2.h:R2.l
+[X]     = (R1.h * R3.h) * 2^96
+[X]       + (R1.h * R3.l + R1.l * R3.h) * 2^80
+[X]       + (R1.h * R2.h + R1.l * R3.l + R3.h * R0.h) * 2^64
+[T1]      + (R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h) * 2^48
+[T2]      + (R1.l * R2.l + R3.l * R0.l + R0.h * R2.h) * 2^32
+[T3]      + (R0.l * R2.h + R2.l * R0.h) * 2^16
+[T4]      + (R0.l * R2.l)
+
+       We can discard the first three lines marked "X" since we produce
+       only a 64 bit result.  So, we need ten 16-bit multiplies.
+
+       Individual mul-acc results:
+[E1]    =  R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h
+[E2]    =  R1.l * R2.l + R3.l * R0.l + R0.h * R2.h
+[E3]    =  R0.l * R2.h + R2.l * R0.h
+[E4]    =  R0.l * R2.l
+
+       We also need to add high parts from lower-level results to higher ones:
+       E[n]c = E[n] + (E[n+1]c >> 16), where E4c := E4
+
+       One interesting property is that all parts of the result that depend
+       on the sign of the multiplication are discarded.  Those would be the
+       multiplications involving R1.h and R3.h, but only the top 16 bits of
+       each 32-bit product depend on the sign, and since R1.h and R3.h only
+       occur in E1 (weighted by 2^48), the top half of these products is cut
+       off.  So, we can just use FU mode for all of the 16-bit multiplies,
+       and ignore questions of when to use mixed mode.  */
+
+___muldi3:
+       /* Save R4 in [SP] (technically part of the caller's frame, but usable as
+          scratch space) and fetch R3, the high word of R3:R2, from [SP + 12].  */
+       A0 = R2.H * R1.L, A1 = R2.L * R1.H (FU) || R3 = [SP + 12];      /* E1 */
+       A0 += R3.H * R0.L, A1 += R3.L * R0.H (FU) || [SP] = R4;         /* E1 */
+       A0 += A1;                                                       /* E1 */
+       R4 = A0.w;                                                      /* keep E1; only R4.l is used below */
+       A0 = R0.l * R3.l (FU);                                          /* E2 */
+       A0 += R2.l * R1.l (FU);                                         /* E2 */
+
+       A1 = R2.L * R0.L (FU);                                          /* E4 */
+       R3 = A1.w;                                                      /* keep E4; R3 is no longer needed as an operand */
+       A1 = A1 >> 16;                                                  /* E3c */
+       A0 += R2.H * R0.H, A1 += R2.L * R0.H (FU);                      /* E2, E3c */
+       A1 += R0.L * R2.H (FU);                                         /* E3c */
+       R0 = A1.w;                                                      /* keep E3c; all multiplies reading R0 are done */
+       A1 = A1 >> 16;                                                  /* E2c */
+       A0 += A1;                                                       /* E2c */
+       R1 = A0.w;                                                      /* keep E2c */
+
+       /* low(result) = low(E3c):low(E4) */
+       R0 = PACK (R0.l, R3.l);
+       /* high(result) = E2c + (E1 << 16); R4 is restored in the same issue slot */
+       R1.h = R1.h + R4.l (NS) || R4 = [SP];
+       RTS;
+
+.size ___muldi3, .-___muldi3
diff --git a/arch/blackfin/lib/muldi3.c b/arch/blackfin/lib/muldi3.c
deleted file mode 100644 (file)
index 303d0c6..0000000
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * File:         arch/blackfin/lib/muldi3.c
- * Based on:
- * Author:
- *
- * Created:
- * Description:
- *
- * Modified:
- *               Copyright 2004-2006 Analog Devices Inc.
- *
- * Bugs:         Enter bugs at http://blackfin.uclinux.org/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see the file COPYING, or write
- * to the Free Software Foundation, Inc.,
- * 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#ifndef SI_TYPE_SIZE
-#define SI_TYPE_SIZE 32
-#endif
-#define __ll_b (1L << (SI_TYPE_SIZE / 2))
-#define __ll_lowpart(t) ((usitype) (t) % __ll_b)
-#define __ll_highpart(t) ((usitype) (t) / __ll_b)
-#define BITS_PER_UNIT 8
-
-#if !defined(umul_ppmm)
-#define umul_ppmm(w1, w0, u, v)                                                \
-  do {                                                                 \
-    usitype __x0, __x1, __x2, __x3;                                    \
-    usitype __ul, __vl, __uh, __vh;                                    \
-                                                                       \
-    __ul = __ll_lowpart (u);                                           \
-    __uh = __ll_highpart (u);                                          \
-    __vl = __ll_lowpart (v);                                           \
-    __vh = __ll_highpart (v);                                          \
-                                                                       \
-    __x0 = (usitype) __ul * __vl;                                      \
-    __x1 = (usitype) __ul * __vh;                                      \
-    __x2 = (usitype) __uh * __vl;                                      \
-    __x3 = (usitype) __uh * __vh;                                      \
-                                                                       \
-    __x1 += __ll_highpart (__x0);/* this can't give carry */           \
-    __x1 += __x2;              /* but this indeed can */               \
-    if (__x1 < __x2)           /* did we get it? */                    \
-      __x3 += __ll_b;          /* yes, add it in the proper pos. */    \
-                                                                       \
-    (w1) = __x3 + __ll_highpart (__x1);                                        \
-    (w0) = __ll_lowpart (__x1) * __ll_b + __ll_lowpart (__x0);         \
-  } while (0)
-#endif
-
-#if !defined(__umulsidi3)
-#define __umulsidi3(u, v)                                              \
-  ({diunion __w;                                                        \
-       umul_ppmm (__w.s.high, __w.s.low, u, v);                         \
-           __w.ll; })
-#endif
-
-typedef unsigned int usitype __attribute__ ((mode(SI)));
-typedef int sitype __attribute__ ((mode(SI)));
-typedef int ditype __attribute__ ((mode(DI)));
-typedef int word_type __attribute__ ((mode(__word__)));
-
-struct distruct {
-       sitype low, high;
-};
-typedef union {
-       struct distruct s;
-       ditype ll;
-} diunion;
-
-#ifdef CONFIG_ARITHMETIC_OPS_L1
-ditype __muldi3(ditype u, ditype v)__attribute__((l1_text));
-#endif
-
-ditype __muldi3(ditype u, ditype v)
-{
-       diunion w;
-       diunion uu, vv;
-
-       uu.ll = u, vv.ll = v;
-       w.ll = __umulsidi3(uu.s.low, vv.s.low);
-       w.s.high += ((usitype) uu.s.low * (usitype) vv.s.high
-                    + (usitype) uu.s.high * (usitype) vv.s.low);
-
-       return w.ll;
-}