| Bill Allombert on Mon, 12 Apr 2004 14:31:45 +0200 |
[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]
| Inline assembly kernel for x86_64/amd64/ia32e 64-bit mode PARI |
Hello PARI-dev,
I am investigating what needs to be done for proper x86_64/amd64/ia32e
optimized assembly support in PARI.
The good news is that the attached patch should enable the inline assembly level0
kernel for x86_64/amd64/ia32e in 64-bit mode.
Some basic performance points:
The box is an AMD Opteron(tm) Processor 240 at 1400MHz with 1MB of cache.
'make bench' takes around 400ms instead of 600ms with this patch.
factor(2^256+1) takes around 5s instead of 10s.
The bad news is that I don't have access to an x86_64 box with a standard
installation, so I don't really know how they behave; I know nothing about
x86_64 assembly, and I am no better with plain x86 asm. This patch was
done by changing addl to addq, etc.
Before committing this patch there are some issues to iron out:
--- Given the 3 names for the architecture, how to name the kernel ?
(I would choose x86_64 to match the Linux kernel uname value).
--- We need to tell apart 32-bit mode from 64-bit mode (easy) and use
either the ix86 kernel or the x86_64 one.
--- x86_64 in 64-bit mode seems to need -fPIC to build shared libraries,
but not in 32-bit mode.
--- The ix86 non-inline asm kernel is not PIC-aware. This is not a
problem in 32-bit mode, but just renaming addl to addq will not lead
to a PIC-aware 64-bit mode non-inline asm kernel...
--- Given the amount of changes to make the x86_64 level0.h from the ix86
one, I wonder if it could be worthwhile to use only one copy with
some CPP tricks.
Before I do that, please test the patch on an x86-64 box with a more standard
installation so I get some idea of what is going on.
Cheers,
Bill.
Index: config/get_archos
===================================================================
RCS file: /home/cvs/pari/config/get_archos,v
retrieving revision 1.14
diff -u -r1.14 get_archos
--- config/get_archos 15 Oct 2003 12:34:30 -0000 1.14
+++ config/get_archos 12 Apr 2004 10:49:00 -0000
@@ -52,6 +52,7 @@
esac ;;
alpha) asmarch=$arch; pretty=Alpha ;;
ppc) asmarch=$arch; pretty='Power PC' ;;
+ x86_64) asmarch=$arch; pretty='amd64/ia32e' ;;
arm*) asmarch=none; pretty=$arch ;;
mips) asmarch=none; pretty=MIPS ;;
sh3) asmarch=none; pretty=SH-3 ;;
Index: config/get_cc
===================================================================
RCS file: /home/cvs/pari/config/get_cc,v
retrieving revision 1.22
diff -u -r1.22 get_cc
--- config/get_cc 21 Oct 2003 16:41:54 -0000 1.22
+++ config/get_cc 12 Apr 2004 10:49:00 -0000
@@ -119,7 +119,7 @@
DBGFLAGS=${DBGFLAGS:-"-g $warn"}
# Some architectures need -fPIC for building dynamic lib
case "$osname-$arch" in
- hpux-*|*-ia64) DLCFLAGS=-fPIC ;;
+ hpux-*|*-ia64|*-x86_64) DLCFLAGS=-fPIC ;;
darwin-*) DLCFLAGS=-fno-common;;
esac
# Specific optimisations for some architectures
--- /dev/null Wed Apr 10 22:14:05 2002
+++ src/kernel/x86_64/MakeLVL0.SH Mon Apr 12 02:08:23 2004
@@ -0,0 +1,12 @@
+# Level 0 kernel is "asm inline" if gcc and "asm extern" if not
+
+level0=$src/kernel/$kernlvl0
+none=$src/kernel/none
+
+cat >> $file << EOT
+parilvl0.h: $level0/asm0.h
+ cat $level0/asm0.h > parilvl0.h
+kernel\$(_O): $none/level0.h pariinl.h
+ \$(CC) -c \$(CFLAGS) \$(CPPFLAGS) -o kernel\$(_O) $none/level0.c
+EOT
+
--- /dev/null Wed Apr 10 22:14:05 2002
+++ src/kernel/x86_64/asm0.h Mon Apr 12 02:37:44 2004
@@ -0,0 +1,136 @@
+#line 2 "../src/kernel/x86_64/asm0.h" /* this header is copied into parilvl0.h; point diagnostics back at the real x86_64 path */
+/* $Id: level0.h,v 1.9 2003/03/05 20:17:11 karim Exp $
+
+Copyright (C) 2004 The PARI group.
+
+This file is part of the PARI/GP package.
+
+PARI/GP is free software; you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free Software
+Foundation. It is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY WHATSOEVER.
+
+Check the License for details. You should have received a copy of it, along
+with the package; see the file 'COPYING'. If not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
+
+/* Written by Bill Allombert from the ix86 version by Bruno Haible. Basically
+ * change insl to insq*/
+
+#ifndef ASMINLINE
+
+#define LOCAL_OVERFLOW /* expands to nothing here: overflow is the extern variable declared below */
+#define LOCAL_HIREMAINDER /* expands to nothing here: hiremainder is the extern variable declared below */
+/* Non-ASMINLINE build: every function below has a macro of the same name in the ASMINLINE branch. */
+BEGINEXTERN
+ extern ulong overflow, hiremainder; /* state shared with the routines below; their definitions are not in this file */
+ extern long addll(ulong a, ulong b); /* NOTE(review): declared long although the same-named macros yield ulong -- confirm intended */
+ extern long addllx(ulong a, ulong b);
+ extern long subll(ulong a, ulong b);
+ extern long subllx(ulong a, ulong b);
+ extern long shiftl(ulong x, ulong y);
+ extern long shiftlr(ulong x, ulong y);
+ extern long mulll(ulong x, ulong y);
+ extern long addmul(ulong x, ulong y);
+ extern long divll(ulong x, ulong y);
+ extern long bfffo(ulong x);
+ENDEXTERN
+
+#else /* ASMINLINE */
+
+#define LOCAL_HIREMAINDER register ulong hiremainder /* declares the hiremainder variable that the macros below read and write by name */
+#define LOCAL_OVERFLOW register ulong overflow /* declares the overflow variable that the add/sub macros below read and write by name */
+
+/* Different assemblers have different syntax for the double-shift instructions
+ (used below as "shldq" and "shrdq"): some want the %cl count operand spelled out. */
+#if defined(__EMX__) || defined(__DJGCC__) || defined(__GO32__) || (defined(linux) && !defined(__ELF__)) || defined(__386BSD__) || defined(__NetBSD__) || defined(__FreeBSD__) || defined(NeXT) || defined(__CYGWIN32__) || defined(__MINGW32__) || defined(COHERENT)
+# define SHCL "%%cl," /* pasted into the asm template, where %% yields a literal %: explicit "%cl," operand */
+#else /* NOTE(review): __DJGCC__ above may be meant as __DJGPP__ -- confirm */
+# define SHCL /* empty: the two-operand form is used and the count register stays implicit */
+#endif
+
+
+#define addll(a,b) /* yields a + b (wrapping); overflow is set to the carry out, 0 or 1 */ \
+({ ulong __value, __arg1 = (a), __arg2 = (b); \
+ __asm__ ("addq %3,%0 ; adcq %1,%1" /* %1 enters as 0, so adcq leaves exactly the carry flag in it */ \
+ : "=r" (__value), "=r" (overflow) \
+ : "0" (__arg1), "rme" (__arg2), "1" ((ulong)0) /* "e" not "g": addq only encodes 32-bit sign-extended immediates */ \
+ : "cc"); \
+ __value; \
+})
+
+#define addllx(a,b) /* yields a + b + (overflow != 0), wrapping; overflow is then set to the carry out */ \
+({ ulong __value, __arg1 = (a), __arg2 = (b), __temp; \
+ __asm__ ("subq %5,%2 ; adcq %4,%0 ; adcq %1,%1" /* 0 - overflow sets the carry flag iff overflow != 0 */ \
+ : "=r" (__value), "=r" (overflow), "=r" (__temp) \
+ : "0" (__arg1), "rme" (__arg2), "rme" (overflow), "1" ((ulong)0), "2" ((ulong)0) /* "e" not "g": immediates must fit 32 signed bits */ \
+ : "cc"); \
+ __value; \
+})
+
+#define subll(a,b) /* yields a - b (wrapping); overflow is set to the borrow out, 0 or 1 */ \
+({ ulong __value, __arg1 = (a), __arg2 = (b); \
+ __asm__ ("subq %3,%0 ; adcq %1,%1" /* %1 enters as 0, so adcq leaves exactly the borrow (carry flag) in it */ \
+ : "=r" (__value), "=r" (overflow) \
+ : "0" (__arg1), "rme" (__arg2), "1" ((ulong)0) /* "e" not "g": subq only encodes 32-bit sign-extended immediates */ \
+ : "cc"); \
+ __value; \
+})
+
+#define subllx(a,b) /* yields a - b - (overflow != 0), wrapping; overflow is then set to the borrow out */ \
+({ ulong __value, __arg1 = (a), __arg2 = (b), __temp; \
+ __asm__ ("subq %5,%2 ; sbbq %4,%0 ; adcq %1,%1" /* 0 - overflow sets the carry flag iff overflow != 0 */ \
+ : "=r" (__value), "=r" (overflow), "=r" (__temp) \
+ : "0" (__arg1), "rme" (__arg2), "rme" (overflow), "1" ((ulong)0), "2" ((ulong)0) /* "e" not "g": immediates must fit 32 signed bits */ \
+ : "cc"); \
+ __value; \
+})
+
+#define shiftl(a,c) /* yields a << c; hiremainder receives the bits shifted out of the top of a */ \
+({ ulong __valuelo = (a), __count = (c), __valuehi; \
+ __asm__ ("shldq "SHCL"%2,%0" /* shift %0 left by %cl bits, feeding in %2 from the right */ \
+ : "=q" (__valuehi) \
+ : "0" ((ulong)0), "q" (__valuelo), "c" /* %rcx; the shift count is taken from its low byte %cl */ (__count)); \
+ hiremainder = __valuehi; \
+ __valuelo << __count; /* NOTE(review): plain C shift, so c is presumably expected to be below the word size -- confirm */ \
+})
+#define shiftlr(a,c) /* yields a >> c; hiremainder receives the bits shifted out of the bottom of a, left-aligned */ \
+({ ulong __valuehi = (a), __count = (c), __valuelo; \
+ __asm__ ("shrdq "SHCL"%2,%0" /* shift %0 right by %cl bits, feeding in %2 from the left */ \
+ : "=q" (__valuelo) \
+ : "0" ((ulong)0), "q" (__valuehi), "c" /* %rcx; the shift count is taken from its low byte %cl */ (__count)); \
+ hiremainder = __valuelo; \
+ __valuehi >> __count; /* NOTE(review): plain C shift, so c is presumably expected to be below the word size -- confirm */ \
+})
+
+#define mulll(a,b) /* full unsigned product of a and b: yields the low word, stores the high word in hiremainder */ \
+({ ulong __valuelo, __arg1 = (a), __arg2 = (b); \
+ __asm__ ("mulq %3" /* one-operand mulq: the other factor is the "a" register, tied to __arg1 below */ \
+ : "=a" /* %rax */ (__valuelo), "=d" /* %rdx */ (hiremainder) \
+ : "0" (__arg1), "rm" (__arg2)); \
+ __valuelo; \
+})
+
+#define addmul(a,b) /* computes a*b + hiremainder: yields the low word, stores the high word (carry included) in hiremainder */ \
+({ ulong __valuelo, __arg1 = (a), __arg2 = (b), __temp; \
+ __asm__ ("mulq %4 ; addq %5,%0 ; adcq %6,%1" /* %6 is 0, so adcq just folds the addq carry into the high word */ \
+ : "=a" /* %rax */ (__valuelo), "=&d" /* %rdx, early-clobber: written by mulq before %5 is read */ (hiremainder), "=r" (__temp) \
+ : "0" (__arg1), "rm" (__arg2), "rme" (hiremainder), "2" ((ulong)0) : "cc"); /* "e" not "g": addq immediates are 32-bit signed; flags are clobbered */ \
+ __valuelo; /* NOTE(review): %5 is read after mulq overwrites %0; consider "=&a" if a and hiremainder can share a register */ \
+})
+
+#define divll(a,b) /* divides the two-word value hiremainder:a by b: yields the quotient, stores the remainder in hiremainder */ \
+({ ulong __value, __arg1 = (a), __arg2 = (b); \
+ __asm__ ("divq %4" /* NOTE(review): presumably requires b != 0 and hiremainder < b so the quotient fits one word -- confirm */ \
+ : "=a" /* %rax */ (__value), "=d" /* %rdx */ (hiremainder) \
+ : "0" /* %rax */ (__arg1), "1" /* %rdx */ (hiremainder), "mr" (__arg2)); \
+ __value; \
+})
+
+#define bfffo(x) /* yields 63 minus the bit index that bsrq reports for x, i.e. the count of leading zero bits */ \
+({ ulong __arg = (x); \
+ long leading_one_position; /* NOTE(review): bsrq presumably leaves this unspecified when x == 0 -- confirm callers never pass 0 */ \
+ __asm__ ("bsrq %1,%0" : "=r" (leading_one_position) : "rm" (__arg)); \
+ 63 - leading_one_position; /* 63 is the top bit index of a 64-bit word */ \
+})
+#endif