Bill Allombert on Mon, 01 Nov 2004 00:24:15 +0100


[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]

Re: asm inline hppa level0 kernel


On Fri, Oct 29, 2004 at 12:28:19PM +0200, Bill Allombert wrote:
> 
> I made some research about hppa64, but apparently hppa64 does not
> have 64x64->128bit multiply so doing an asm kernel might not worth
> the trouble.

In fact, not only is there no 64x64->128-bit multiply, there is no
64x64->64-bit multiply on hppa64 either. The portable mulll uses 3
64x64->64-bit multiplies, which will be emulated by 9 32x32->64 multiplies.
However, gcc -O3 is smart enough to see that 2 of the 64x64->64-bit
multiplies are in fact 32x32->64-bit multiplies, so in practice it uses
only 5 32x32->64 multiplies. The theoretical optimum, however, is 3
32x32->64 multiplies.

This implies that the 32x32->64 asm code for the 32bit hppa kernel
could be reused in the 64bit mulll to improve performance.

The attached patch implements a hppa64 inline asm kernel (to be used with
--kernel=hppa64). However, I use 4 32x32->64 multiplies (the naive
schoolbook formula). It may be possible to reach 3 using the classical
identity x1*y2+x2*y1=(x1+x2)*(y1+y2)-x1*y1-x2*y2, but we have to
take care that the additions (x1+x2) and (y1+y2) might overflow 32 bits.

Please test it!

Of course the performance increase is not huge: only 15% on the bench,
and 50% on large mulii.

Cheers,
Bill.
--- /dev/null	Wed Jul  2 16:24:28 2003
+++ src/kernel/hppa64/MakeLVL0.SH	Fri Oct 29 11:34:28 2004
@@ -0,0 +1,11 @@
+# Level 0 kernel is "asm extern": the hppa64 inline-asm level0.h provides
+# everything except divll, which is taken from the portable 'none' kernel.
+kern=$src/kernel/$kernlvl0
+knone=$src/kernel/none
+
+# Append Makefile rules to $file:
+#  - parilvl0.h is the asm level0.h concatenated with the portable divll.h
+#  - kernel$(_O) is compiled from the generic C driver in kernel/none
+cat >> $file << EOT
+parilvl0.h: $kern/level0.h 
+	cat $kern/level0.h $knone/divll.h > parilvl0.h
+kernel\$(_O): .headers $kern/level0.h
+	\$(CC) -c \$(CFLAGS) \$(CPPFLAGS) -o kernel\$(_O) $knone/level0.c
+
+EOT
--- /dev/null	Wed Jul  2 16:24:28 2003
+++ src/kernel/hppa64/level0.h	Sun Oct 31 23:34:22 2004
@@ -0,0 +1,148 @@
+#line 2 "../src/kernel/hppa64/level0.h"
+/* $Id: level0.h,v 1.1 2004/10/29 09:50:00 bill Exp $
+
+Copyright (C) 2004  The PARI group.
+
+This file is part of the PARI/GP package.
+
+PARI/GP is free software; you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free Software
+Foundation. It is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY WHATSOEVER.
+
+Check the License for details. You should have received a copy of it, along
+with the package; see the file 'COPYING'. If not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
+
+/* This file was made using idea from Bruno Haible ix86 asm inline kernel
+ * and code from Nigel Smart hppa asm kernel. mulll was inspired from
+ * longlong.h from the GNU MP package.*/
+
+#ifndef ASMINLINE
+/* Non-inline fallback: the level-0 primitives are external functions,
+ * communicating carries and high words through the globals 'overflow'
+ * and 'hiremainder'. */
+#define LOCAL_OVERFLOW
+#define LOCAL_HIREMAINDER
+
+extern ulong hiremainder, overflow;
+extern long addll(ulong x, ulong y);
+extern long addllx(ulong x, ulong y);
+extern long subll(ulong x, ulong y);
+extern long subllx(ulong x, ulong y);
+extern long mulll(ulong x, ulong y);
+extern long addmul(ulong x, ulong y);
+extern long divll(ulong x, ulong y);
+extern int  bfffo(ulong x);
+
+#else /* ASMINLINE */
+
+/* Inline-asm versions: 'hiremainder' and 'overflow' become function-local
+ * register variables, declared via these macros in each user. */
+#define LOCAL_HIREMAINDER  register ulong hiremainder
+#define LOCAL_OVERFLOW     register ulong overflow
+
+/* addll(a,b): return a+b and set 'overflow' to the carry out (0 or 1).
+ * "add,dc %r0,%r0,%1" (add-with-carry of two zeros) materializes the
+ * carry bit into the overflow register. */
+#define addll(a,b) \
+({ ulong __value, __arg1 = (a), __arg2 = (b); \
+   __asm__ ("add %2,%3,%0\n\tadd,dc %%r0,%%r0,%1" \
+        : "=r" (__value), "=r" (overflow) \
+        : "r" (__arg1), "r" (__arg2) \
+        : "cc"); \
+  __value; \
+})
+
+/* addllx(a,b): return a+b+overflow, setting 'overflow' to the new carry.
+ * "sub %4,%5,%r0" computes overflow-1 into the bit bucket purely to
+ * preload the carry flag from the old overflow value (on PA-RISC, carry
+ * is set iff the subtraction needs no borrow, i.e. iff overflow was 1);
+ * add,dc then adds with that carry. */
+#define addllx(a,b) \
+({ ulong __value, __arg1 = (a), __arg2 = (b); \
+   __asm__ ("sub %4,%5,%%r0\n\tadd,dc %2,%3,%0\n\tadd,dc %%r0,%%r0,%1" \
+        : "=r" (__value), "=r" (overflow) \
+        : "r" (__arg1), "r" (__arg2), "r" (overflow), "r" ((ulong) 1)\
+        : "cc"); \
+  __value; \
+})
+
+/* subll(a,b): return a-b and set 'overflow' to the borrow (0 or 1).
+ * add,dc captures the PA-RISC carry bit (set iff NO borrow occurred);
+ * "subi 1,%1,%1" (t = 1 - r) flips it into a borrow flag. */
+#define subll(a,b) \
+({ ulong __value, __arg1 = (a), __arg2 = (b); \
+   __asm__ ("sub %2,%3,%0\n\tadd,dc %%r0,%%r0,%1\n\tsubi 1,%1,%1" \
+        : "=r" (__value), "=r" (overflow) \
+        : "r" (__arg1), "r" (__arg2) \
+        : "cc"); \
+  __value; \
+})
+
+/* subllx(a,b): return a-b-overflow, setting 'overflow' to the new borrow.
+ * "sub %r0,%4,%r0" computes 0-overflow into the bit bucket just to
+ * preload the borrow flag from the old overflow value; sub,db then
+ * subtracts with borrow, and the add,dc/subi pair extracts the resulting
+ * borrow as in subll.  __value is earlyclobber ("=&r") so it cannot be
+ * allocated on top of the overflow input still needed by the first insn. */
+#define subllx(a,b) \
+({ ulong __value, __arg1 = (a), __arg2 = (b); \
+   __asm__ ("sub %%r0,%4,%%r0\n\tsub,db %2,%3,%0\n\tadd,dc %%r0,%%r0,%1\n\tsubi 1,%1,%1" \
+        : "=&r" (__value), "=r" (overflow) \
+        : "r" (__arg1), "r" (__arg2), "r" (overflow)\
+        : "cc"); \
+  __value; \
+})
+
+/* __addllc(a,b,c): return a+b and fold the carry out into c (c += carry).
+ * Internal helper used by mulll/addmul to propagate low-word carries
+ * into the high word. */
+#define __addllc(a,b,c) \
+({ ulong __value, __arg1 = (a), __arg2 = (b); \
+   __asm__ ("add %2,%3,%0\n\tadd,dc %4,%%r0,%1" \
+        : "=&r" (__value), "=r" (c) \
+        : "r" (__arg1), "r" (__arg2), "r" (c) \
+        : "cc"); \
+  __value; \
+})
+
+/* __mulhh(a,b): 32x32->64-bit unsigned multiply via the FPU xmpyu
+ * instruction; operands and result live in floating-point registers
+ * (constraint "f"). */
+#define __mulhh(a,b) \
+({ unsigned int __arg1 = (a), __arg2 = (b); \
+   ulong __value; \
+   __asm__ ("xmpyu %1,%2,%0" \
+        : "=f" (__value) \
+        : "f" (__arg1), "f" (__arg2) \
+        : "cc"); \
+   __value; \
+})
+
+/* mulll(x,y): 64x64 -> 128-bit unsigned multiply.  Returns the low 64
+ * bits of x*y and stores the high 64 bits in 'hiremainder'.  Schoolbook
+ * decomposition into four 32x32->64 products (__mulhh):
+ *   x*y = xhi*yhi*2^64 + (xlo*yhi + xhi*ylo)*2^32 + xlo*ylo.
+ * A carry out of the 64-bit sum __xylh+__xyhl has weight 2^96, i.e.
+ * 1 << BITS_IN_HALFULONG in the high word — hence the conditional fixup. */
+#define mulll(arg1,arg2) \
+({ \
+  const ulong __x=(arg1), __y=(arg2); \
+  const ulong __xlo = LOWWORD(__x), __xhi = HIGHWORD(__x); \
+  const ulong __ylo = LOWWORD(__y), __yhi = HIGHWORD(__y); \
+  ulong __xylo,__xymid,__xyhi,__xymidhi,__xymidlo; \
+  ulong __xylh,__xyhl; \
+  __xylo = __mulhh(__xlo,__ylo); __xyhi = __mulhh(__xhi,__yhi); \
+  __xylh = __mulhh(__xlo,__yhi); __xyhl = __mulhh(__xhi,__ylo); \
+  __xymid = __xylh+__xyhl; \
+  if (__xymid<__xylh) __xyhi += (1UL << BITS_IN_HALFULONG); \
+  __xymidhi = HIGHWORD(__xymid); \
+  __xymidlo = __xymid << BITS_IN_HALFULONG; \
+  __xylo = __addllc(__xylo,__xymidlo,__xyhi); \
+  hiremainder = __xyhi + __xymidhi; \
+  __xylo; \
+})
+
+/* addmul(x,y): as mulll, but first adds the incoming 'hiremainder' to
+ * the 128-bit product: returns the low 64 bits of x*y + hiremainder and
+ * stores the high 64 bits back into 'hiremainder'.  The extra __addllc
+ * folds the old hiremainder into the low word with carry into __xyhi. */
+#define addmul(arg1,arg2) \
+({ \
+  const ulong __x=(arg1), __y=(arg2); \
+  const ulong __xlo = LOWWORD(__x), __xhi = HIGHWORD(__x); \
+  const ulong __ylo = LOWWORD(__y), __yhi = HIGHWORD(__y); \
+  ulong __xylo,__xymid,__xyhi,__xymidhi,__xymidlo; \
+  ulong __xylh,__xyhl; \
+  __xylo = __mulhh(__xlo,__ylo); __xyhi = __mulhh(__xhi,__yhi); \
+  __xylh = __mulhh(__xlo,__yhi); __xyhl = __mulhh(__xhi,__ylo); \
+  __xymid = __xylh+__xyhl; \
+  if (__xymid<__xylh) __xyhi += (1UL << BITS_IN_HALFULONG); \
+  __xylo = __addllc(__xylo,hiremainder,__xyhi); \
+  __xymidhi = HIGHWORD(__xymid); \
+  __xymidlo = __xymid << BITS_IN_HALFULONG; \
+  __xylo = __addllc(__xylo,__xymidlo,__xyhi); \
+  hiremainder = __xyhi + __xymidhi; \
+  __xylo; \
+})
+
+/* From Peter Montgomery */
+
+/* bfffo(x): number of leading zero bits of x (yields BITS_IN_LONG when
+ * x == 0).  A branchy binary search narrows x down to its top nonzero
+ * 4-bit nibble; the 16-entry table supplies the remaining count. */
+#define bfffo(x) \
+({int __value; \
+  ulong __arg1=(x); \
+  static int __bfffo_tabshi[16]={4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0};\
+  __value = BITS_IN_LONG - 4; \
+  if (__arg1 & 0xffffffff00000000UL) {__value -= 32; __arg1 >>= 32;} \
+  if (__arg1 > 0xffffUL) {__value -= 16; __arg1 >>= 16;} \
+  if (__arg1 > 0x00ffUL) {__value -= 8; __arg1 >>= 8;} \
+  if (__arg1 > 0x000fUL) {__value -= 4; __arg1 >>= 4;} \
+  __value + __bfffo_tabshi[__arg1]; \
+})
+
+#endif