Bill Allombert on Mon, 01 Nov 2004 00:24:15 +0100
Re: asm inline hppa level0 kernel
On Fri, Oct 29, 2004 at 12:28:19PM +0200, Bill Allombert wrote:
>
> I made some research about hppa64, but apparently hppa64 does not
> have 64x64->128bit multiply so doing an asm kernel might not worth
> the trouble.

In fact, not only is there no 64x64->128bit multiply, there is no
64x64->64bit multiply on hppa64 either. The portable mulll uses 3
64x64->64bit multiplies, which would be emulated by 9 32x32->64bit
multiplies. However, gcc -O3 is smart enough to see that 2 of the
64x64->64bit multiplies are in fact 32x32->64bit multiplies, so in
practice it uses only 5 32x32->64bit multiplies. The theoretical
optimum, though, is 3 32x32->64bit multiplies.

This implies that the 32x32->64bit asm code from the 32bit hppa kernel
can be reused in the 64bit mulll to improve performance.

The attached patch implements a hppa64 inline asm kernel (to be used
with --kernel=hppa64). However, it uses 4 32x32->64bit multiplies (the
naive schoolbook formula). It should be possible to get down to 3 using
the classical identity x1*y2+x2*y1 = (x1+x2)*(y1+y2) - x1*y1 - x2*y2,
but then we have to take care that the sums (x1+x2) and (y1+y2) may
overflow 32 bits.

Please test it! Of course the performance increase is not huge: about
15% on the bench and 50% on large mulii.

Cheers,
Bill.
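[Editorial illustration, not part of the patch: one way the 3-multiply
(Karatsuba-style) variant could deal with the possible overflow of
(x1+x2) and (y1+y2) is to split off their carry bits, which then only
contribute shifted conditional additions. The sketch below is portable
C, the names mul64_karatsuba, cr_hi, cr_lo are made up here, and only
3 genuine 32x32->64bit multiplies are performed.]

#include <stdint.h>
#include <stdio.h>

/* Sketch only: 64x64->128 multiply with 3 32x32->64 products. */
static uint64_t
mul64_karatsuba(uint64_t x, uint64_t y, uint64_t *hi)
{
  uint32_t xh = (uint32_t)(x >> 32), xl = (uint32_t)x;
  uint32_t yh = (uint32_t)(y >> 32), yl = (uint32_t)y;

  uint64_t p_hh = (uint64_t)xh * yh;       /* multiply 1: high halves */
  uint64_t p_ll = (uint64_t)xl * yl;       /* multiply 2: low halves  */

  /* (xh+xl) and (yh+yl) may need 33 bits: split off the carry bits. */
  uint64_t sx = (uint64_t)xh + xl, sy = (uint64_t)yh + yl;
  uint64_t cx = sx >> 32, cy = sy >> 32;   /* each 0 or 1 */
  uint32_t sxl = (uint32_t)sx, syl = (uint32_t)sy;

  /* cross = xh*yl + xl*yh = sx*sy - p_hh - p_ll, a 65-bit value kept
   * as the pair (cr_hi, cr_lo).  Only sxl*syl is a real multiply; the
   * carry bits cx, cy turn into conditional shifted additions. */
  uint64_t cr_lo = (uint64_t)sxl * syl;    /* multiply 3 */
  uint64_t cr_hi = cx & cy, t;
  if (cx) { t = (uint64_t)syl << 32; cr_lo += t; cr_hi += (cr_lo < t); }
  if (cy) { t = (uint64_t)sxl << 32; cr_lo += t; cr_hi += (cr_lo < t); }
  cr_hi -= (cr_lo < p_hh); cr_lo -= p_hh;  /* subtract with borrow */
  cr_hi -= (cr_lo < p_ll); cr_lo -= p_ll;

  /* x*y = p_hh*2^64 + cross*2^32 + p_ll */
  uint64_t lo = p_ll + (cr_lo << 32);
  *hi = p_hh + (cr_hi << 32) + (cr_lo >> 32) + (lo < p_ll);
  return lo;
}

int main(void)
{
  uint64_t hi, lo = mul64_karatsuba(UINT64_MAX, UINT64_MAX, &hi);
  /* (2^64-1)^2 = 0xfffffffffffffffe 0000000000000001 */
  printf("%s\n", (hi == 0xfffffffffffffffeULL && lo == 1) ? "ok" : "BUG");
  return 0;
}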
--- /dev/null	Wed Jul  2 16:24:28 2003
+++ src/kernel/hppa64/MakeLVL0.SH	Fri Oct 29 11:34:28 2004
@@ -0,0 +1,11 @@
+# Level 0 kernel is "asm extern"
+kern=$src/kernel/$kernlvl0
+knone=$src/kernel/none
+
+cat >> $file << EOT
+parilvl0.h: $kern/level0.h
+	cat $kern/level0.h $knone/divll.h > parilvl0.h
+kernel\$(_O): .headers $kern/level0.h
+	\$(CC) -c \$(CFLAGS) \$(CPPFLAGS) -o kernel\$(_O) $knone/level0.c
+
+EOT
--- /dev/null	Wed Jul  2 16:24:28 2003
+++ src/kernel/hppa64/level0.h	Sun Oct 31 23:34:22 2004
@@ -0,0 +1,148 @@
+#line 2 "../src/kernel/hppa64/level0.h"
+/* $Id: level0.h,v 1.1 2004/10/29 09:50:00 bill Exp $
+
+Copyright (C) 2004 The PARI group.
+
+This file is part of the PARI/GP package.
+
+PARI/GP is free software; you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free Software
+Foundation. It is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY WHATSOEVER.
+
+Check the License for details. You should have received a copy of it, along
+with the package; see the file 'COPYING'. If not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
+
+/* This file was made using idea from Bruno Haible ix86 asm inline kernel
+ * and code from Nigel Smart hppa asm kernel. mulll was inspired from
+ * longlong.h from the GNU MP package.*/
+
+#ifndef ASMINLINE
+#define LOCAL_OVERFLOW
+#define LOCAL_HIREMAINDER
+
+extern ulong hiremainder, overflow;
+extern long addll(ulong x, ulong y);
+extern long addllx(ulong x, ulong y);
+extern long subll(ulong x, ulong y);
+extern long subllx(ulong x, ulong y);
+extern long mulll(ulong x, ulong y);
+extern long addmul(ulong x, ulong y);
+extern long divll(ulong x, ulong y);
+extern int  bfffo(ulong x);
+
+#else /* ASMINLINE */
+
+#define LOCAL_HIREMAINDER  register ulong hiremainder
+#define LOCAL_OVERFLOW     register ulong overflow
+
+#define addll(a,b) \
+({ ulong __value, __arg1 = (a), __arg2 = (b); \
+   __asm__ ("add %2,%3,%0\n\tadd,dc %%r0,%%r0,%1" \
+        : "=r" (__value), "=r" (overflow) \
+        : "r" (__arg1), "r" (__arg2) \
+        : "cc"); \
+  __value; \
+})
+
+#define addllx(a,b) \
+({ ulong __value, __arg1 = (a), __arg2 = (b); \
+   __asm__ ("sub %4,%5,%%r0\n\tadd,dc %2,%3,%0\n\tadd,dc %%r0,%%r0,%1" \
+        : "=r" (__value), "=r" (overflow) \
+        : "r" (__arg1), "r" (__arg2), "r" (overflow), "r" ((ulong) 1)\
+        : "cc"); \
+  __value; \
+})
+
+#define subll(a,b) \
+({ ulong __value, __arg1 = (a), __arg2 = (b); \
+   __asm__ ("sub %2,%3,%0\n\tadd,dc %%r0,%%r0,%1\n\tsubi 1,%1,%1" \
+        : "=r" (__value), "=r" (overflow) \
+        : "r" (__arg1), "r" (__arg2) \
+        : "cc"); \
+  __value; \
+})
+
+#define subllx(a,b) \
+({ ulong __value, __arg1 = (a), __arg2 = (b); \
+   __asm__ ("sub %%r0,%4,%%r0\n\tsub,db %2,%3,%0\n\tadd,dc %%r0,%%r0,%1\n\tsubi 1,%1,%1" \
+        : "=&r" (__value), "=r" (overflow) \
+        : "r" (__arg1), "r" (__arg2), "r" (overflow)\
+        : "cc"); \
+  __value; \
+})
+
+/* z=a+b; c+= carry; return z */
+#define __addllc(a,b,c) \
+({ ulong __value, __arg1 = (a), __arg2 = (b); \
+   __asm__ ("add %2,%3,%0\n\tadd,dc %4,%%r0,%1" \
+        : "=&r" (__value), "=r" (c) \
+        : "r" (__arg1), "r" (__arg2), "r" (c) \
+        : "cc"); \
+  __value; \
+})
+
+/* 32x32->64 multiply*/
+#define __mulhh(a,b) \
+({ unsigned int __arg1 = (a), __arg2 = (b); \
+   ulong __value; \
+   __asm__ ("xmpyu %1,%2,%0" \
+        : "=f" (__value) \
+        : "f" (__arg1), "f" (__arg2) \
+        : "cc"); \
+  __value; \
+})
+
+#define mulll(arg1,arg2) \
+({ \
+  const ulong __x=(arg1), __y=(arg2); \
+  const ulong __xlo = LOWWORD(__x), __xhi = HIGHWORD(__x); \
+  const ulong __ylo = LOWWORD(__y), __yhi = HIGHWORD(__y); \
+  ulong __xylo,__xymid,__xyhi,__xymidhi,__xymidlo; \
+  ulong __xylh,__xyhl; \
+  __xylo = __mulhh(__xlo,__ylo); __xyhi = __mulhh(__xhi,__yhi); \
+  __xylh = __mulhh(__xlo,__yhi); __xyhl = __mulhh(__xhi,__ylo); \
+  __xymid = __xylh+__xyhl; \
+  if (__xymid<__xylh) __xyhi += (1UL << BITS_IN_HALFULONG); \
+  __xymidhi = HIGHWORD(__xymid); \
+  __xymidlo = __xymid << BITS_IN_HALFULONG; \
+  __xylo = __addllc(__xylo,__xymidlo,__xyhi); \
+  hiremainder = __xyhi + __xymidhi; \
+  __xylo; \
+})
+
+#define addmul(arg1,arg2) \
+({ \
+  const ulong __x=(arg1), __y=(arg2); \
+  const ulong __xlo = LOWWORD(__x), __xhi = HIGHWORD(__x); \
+  const ulong __ylo = LOWWORD(__y), __yhi = HIGHWORD(__y); \
+  ulong __xylo,__xymid,__xyhi,__xymidhi,__xymidlo; \
+  ulong __xylh,__xyhl; \
+  __xylo = __mulhh(__xlo,__ylo); __xyhi = __mulhh(__xhi,__yhi); \
+  __xylh = __mulhh(__xlo,__yhi); __xyhl = __mulhh(__xhi,__ylo); \
+  __xymid = __xylh+__xyhl; \
+  if (__xymid<__xylh) __xyhi += (1UL << BITS_IN_HALFULONG); \
+  __xylo = __addllc(__xylo,hiremainder,__xyhi); \
+  __xymidhi = HIGHWORD(__xymid); \
+  __xymidlo = __xymid << BITS_IN_HALFULONG; \
+  __xylo = __addllc(__xylo,__xymidlo,__xyhi); \
+  hiremainder = __xyhi + __xymidhi; \
+  __xylo; \
+})
+
+/* From Peter Montgomery */
+
+#define bfffo(x) \
+({int __value; \
+  ulong __arg1=(x); \
+  static int __bfffo_tabshi[16]={4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0};\
+  __value = BITS_IN_LONG - 4; \
+  if (__arg1 & 0xffffffff00000000UL) {__value -= 32; __arg1 >>= 32;} \
+  if (__arg1 > 0xffffUL) {__value -= 16; __arg1 >>= 16;} \
+  if (__arg1 > 0x00ffUL) {__value -= 8; __arg1 >>= 8;} \
+  if (__arg1 > 0x000fUL) {__value -= 4; __arg1 >>= 4;} \
+  __value + __bfffo_tabshi[__arg1]; \
+})
+
+#endif
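
[Editorial note: for readers without hppa64 hardware, the mulll scheme
in the patch can be transliterated into portable C as a sanity check.
The sketch below is illustrative only (the name mul64_schoolbook and
the small main are made up here); it follows the same four-multiply,
carry-propagating logic as the __mulhh/__addllc code above.]

#include <stdint.h>
#include <stdio.h>

/* Portable C rendition of the patch's mulll: four 32x32->64 multiplies
 * plus the same carry handling as the __xymid test and __addllc. */
static uint64_t
mul64_schoolbook(uint64_t x, uint64_t y, uint64_t *hi)
{
  uint64_t xlo = x & 0xffffffffu, xhi = x >> 32;
  uint64_t ylo = y & 0xffffffffu, yhi = y >> 32;

  uint64_t xylo = xlo * ylo;               /* 32x32->64, like __mulhh */
  uint64_t xyhi = xhi * yhi;
  uint64_t xylh = xlo * yhi;
  uint64_t xyhl = xhi * ylo;

  uint64_t xymid = xylh + xyhl;            /* may wrap: lost bit is 2^96 */
  if (xymid < xylh) xyhi += (uint64_t)1 << 32;

  uint64_t xymidlo = xymid << 32, xymidhi = xymid >> 32;
  uint64_t lo = xylo + xymidlo;            /* carry goes to the high word */
  if (lo < xylo) xyhi += 1;                /* what __addllc does in asm   */

  *hi = xyhi + xymidhi;
  return lo;
}

int main(void)
{
  uint64_t hi, lo = mul64_schoolbook(UINT64_MAX, UINT64_MAX, &hi);
  /* (2^64-1)^2 = 0xfffffffffffffffe 0000000000000001 */
  printf("%s\n", (hi == 0xfffffffffffffffeULL && lo == 1) ? "ok" : "BUG");
  return 0;
}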