Bill Allombert on Mon, 12 Apr 2004 14:31:45 +0200 |
[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]
Inline assembly kernel for x86_64/amd64/ia32e 64-bit mode PARI |
Hello PARI-dev, I am investigating what needs to be done for proper x86_64/amd64/ia32e optimized assembly support in PARI. The good news is the attached patch, which should enable an inline assembly level0 kernel for x86_64/amd64/ia32e in 64-bit mode. Some basic performance points: The box is an AMD Opteron(tm) Processor 240 at 1400MHz with 1MB of cache. 'make bench' is around 400ms instead of 600ms with this patch. factor(2^256+1) takes around 5s instead of 10s. The bad news is that I don't have access to an x86_64 box with a standard installation, so I don't really know how they behave; I know nothing about x86_64 assembly and I am not better with plain x86 asm. This patch was done by changing addl to addq, etc. Before committing this patch there are some issues to iron out: --- Given the 3 names for the architecture, how to name the kernel? (I would choose x86_64 to match the Linux kernel uname value). --- We need to tell apart 32-bit mode from 64-bit mode (easy) and use either the ix86 kernel or the x86_64 one. --- x86_64 in 64-bit mode seems to need -fPIC to build shared libraries, but not in 32-bit mode. --- The ix86 non-inline asm kernel is not PIC-aware. This is not a problem in 32-bit mode, but just renaming addl to addq will not lead to a PIC-aware 64-bit mode non-inline asm kernel... --- Given the amount of changes to make the x86_64 level0.h from the ix86 one, I wonder if it could be worthwhile to use only one copy with some CPP tricks. Before I do that, please test the patch on an x86-64 box with a more standard installation so I get some idea of what is going on. Cheers, Bill.
Index: config/get_archos =================================================================== RCS file: /home/cvs/pari/config/get_archos,v retrieving revision 1.14 diff -u -r1.14 get_archos --- config/get_archos 15 Oct 2003 12:34:30 -0000 1.14 +++ config/get_archos 12 Apr 2004 10:49:00 -0000 @@ -52,6 +52,7 @@ esac ;; alpha) asmarch=$arch; pretty=Alpha ;; ppc) asmarch=$arch; pretty='Power PC' ;; + x86_64) asmarch=$arch; pretty='amd64/ia32e' ;; arm*) asmarch=none; pretty=$arch ;; mips) asmarch=none; pretty=MIPS ;; sh3) asmarch=none; pretty=SH-3 ;; Index: config/get_cc =================================================================== RCS file: /home/cvs/pari/config/get_cc,v retrieving revision 1.22 diff -u -r1.22 get_cc --- config/get_cc 21 Oct 2003 16:41:54 -0000 1.22 +++ config/get_cc 12 Apr 2004 10:49:00 -0000 @@ -119,7 +119,7 @@ DBGFLAGS=${DBGFLAGS:-"-g $warn"} # Some architectures need -fPIC for building dynamic lib case "$osname-$arch" in - hpux-*|*-ia64) DLCFLAGS=-fPIC ;; + hpux-*|*-ia64|*-x86_64) DLCFLAGS=-fPIC ;; darwin-*) DLCFLAGS=-fno-common;; esac # Specific optimisations for some architectures --- /dev/null Wed Apr 10 22:14:05 2002 +++ src/kernel/x86_64/MakeLVL0.SH Mon Apr 12 02:08:23 2004 @@ -0,0 +1,12 @@ +# Level 0 kernel is "asm inline" if gcc and "asm extern" if not + +level0=$src/kernel/$kernlvl0 +none=$src/kernel/none + +cat >> $file << EOT +parilvl0.h: $level0/asm0.h + cat $level0/asm0.h > parilvl0.h +kernel\$(_O): $none/level0.h pariinl.h + \$(CC) -c \$(CFLAGS) \$(CPPFLAGS) -o kernel\$(_O) $none/level0.c +EOT + --- /dev/null Wed Apr 10 22:14:05 2002 +++ src/kernel/x86_64/asm0.h Mon Apr 12 02:37:44 2004 @@ -0,0 +1,136 @@ +#line 2 "../src/kernel/x86-64/asm0.h" +/* $Id: level0.h,v 1.9 2003/03/05 20:17:11 karim Exp $ + +Copyright (C) 2004 The PARI group. + +This file is part of the PARI/GP package. 
+ +PARI/GP is free software; you can redistribute it and/or modify it under the +terms of the GNU General Public License as published by the Free Software +Foundation. It is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY WHATSOEVER. + +Check the License for details. You should have received a copy of it, along +with the package; see the file 'COPYING'. If not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +/* Written by Bill Allombert from the ix86 version by Bruno Haible. Basically + * change insl to insq. + * NOTE(review): the #line directive at the top of this file spells the + * directory x86-64, while this patch creates src/kernel/x86_64 -- confirm + * which spelling is intended. */ + +#ifndef ASMINLINE + +/* No inline assembly: LOCAL_OVERFLOW and LOCAL_HIREMAINDER expand to nothing, + * and overflow, hiremainder and the level-0 primitives are declared extern. */ +#define LOCAL_OVERFLOW +#define LOCAL_HIREMAINDER + +BEGINEXTERN + extern ulong overflow, hiremainder; + extern long addll(ulong a, ulong b); + extern long addllx(ulong a, ulong b); + extern long subll(ulong a, ulong b); + extern long subllx(ulong a, ulong b); + extern long shiftl(ulong x, ulong y); + extern long shiftlr(ulong x, ulong y); + extern long mulll(ulong x, ulong y); + extern long addmul(ulong x, ulong y); + extern long divll(ulong x, ulong y); + extern long bfffo(ulong x); +ENDEXTERN + +#else /* ASMINLINE */ + +/* Inline assembly: LOCAL_HIREMAINDER and LOCAL_OVERFLOW declare register + * locals named hiremainder and overflow; the macros below read and assign + * variables with exactly those names. */ +#define LOCAL_HIREMAINDER register ulong hiremainder +#define LOCAL_OVERFLOW register ulong overflow + +/* Different assemblers have different syntax for the "shldq" and "shrdq" + instructions: SHCL spells out the "%cl," count operand only for the + toolchains listed here and is empty otherwise. 
*/ +#if defined(__EMX__) || defined(__DJGCC__) || defined(__GO32__) || (defined(linux) && !defined(__ELF__)) || defined(__386BSD__) || defined(__NetBSD__) || defined(__FreeBSD__) || defined(NeXT) || defined(__CYGWIN32__) || defined(__MINGW32__) || defined(COHERENT) +# define SHCL "%%cl," +#else +# define SHCL +#endif + + +/* addll(a,b): evaluates to a + b; overflow receives the carry flag of the + * addition (adcq of a zeroed register with itself). */ +#define addll(a,b) \ +({ ulong __value, __arg1 = (a), __arg2 = (b); \ + __asm__ ("addq %3,%0 ; adcq %1,%1" \ + : "=r" (__value), "=r" (overflow) \ + : "0" (__arg1), "g" (__arg2), "1" ((ulong)0) \ + : "cc"); \ + __value; \ +}) + +/* addllx(a,b): add with carry-in. The leading subq computes 0 - overflow in a + * scratch register, which sets the carry flag when overflow is nonzero; the + * first adcq then adds b plus that carry to a, and the second adcq stores the + * resulting carry in overflow. */ +#define addllx(a,b) \ +({ ulong __value, __arg1 = (a), __arg2 = (b), __temp; \ + __asm__ ("subq %5,%2 ; adcq %4,%0 ; adcq %1,%1" \ + : "=r" (__value), "=r" (overflow), "=r" (__temp) \ + : "0" (__arg1), "g" (__arg2), "g" (overflow), "1" ((ulong)0), "2" ((ulong)0) \ + : "cc"); \ + __value; \ +}) + +/* subll(a,b): evaluates to a - b; overflow receives the borrow (carry flag) + * of the subtraction. */ +#define subll(a,b) \ +({ ulong __value, __arg1 = (a), __arg2 = (b); \ + __asm__ ("subq %3,%0 ; adcq %1,%1" \ + : "=r" (__value), "=r" (overflow) \ + : "0" (__arg1), "g" (__arg2), "1" ((ulong)0) \ + : "cc"); \ + __value; \ +}) + +/* subllx(a,b): subtract with borrow-in. The leading subq computes + * 0 - overflow in a scratch register to load the carry flag, sbbq then + * subtracts b and that borrow from a, and the final adcq stores the borrow + * out in overflow. */ +#define subllx(a,b) \ +({ ulong __value, __arg1 = (a), __arg2 = (b), __temp; \ + __asm__ ("subq %5,%2 ; sbbq %4,%0 ; adcq %1,%1" \ + : "=r" (__value), "=r" (overflow), "=r" (__temp) \ + : "0" (__arg1), "g" (__arg2), "g" (overflow), "1" ((ulong)0), "2" ((ulong)0) \ + : "cc"); \ + __value; \ +}) + +/* shiftl(a,c): evaluates to a << c; hiremainder receives the bits of a + * shifted out at the top, obtained by shldq into a zeroed register. + * NOTE(review): the low word is a plain C shift, so c is presumably expected + * to be smaller than the word size -- confirm against callers. */ +#define shiftl(a,c) \ +({ ulong __valuelo = (a), __count = (c), __valuehi; \ + __asm__ ("shldq "SHCL"%2,%0" /* shift %0 left by %cl bits, feeding in %2 from the right */ \ + : "=q" (__valuehi) \ + : "0" ((ulong)0), "q" (__valuelo), "c" /* %rcx (the count is read from %cl) */ (__count)); \ + hiremainder = __valuehi; \ + __valuelo << __count; \ +}) +/* shiftlr(a,c): evaluates to a >> c; hiremainder receives the result of + * shrdq on a zeroed register fed from a, i.e. the bits of a shifted out at + * the bottom. + * NOTE(review): same presumed restriction on c as for shiftl -- confirm. */ +#define shiftlr(a,c) \ +({ ulong __valuehi = (a), __count = (c), __valuelo; \ + __asm__ ("shrdq "SHCL"%2,%0" /* shift %0 right by %cl bits, feeding in %2 from the left */ \ + : "=q" (__valuelo) \ + : "0" ((ulong)0), "q" (__valuehi), "c" /* %rcx (the count is read from %cl) */ (__count)); \ + hiremainder = __valuelo; \ + 
__valuehi >> __count; \ +}) + +/* mulll(a,b): evaluates to the low word of a * b; hiremainder receives the + * high word (mulq leaves the product in %rdx:%rax). */ +#define mulll(a,b) \ +({ ulong __valuelo, __arg1 = (a), __arg2 = (b); \ + __asm__ ("mulq %3" \ + : "=a" /* %rax */ (__valuelo), "=d" /* %rdx */ (hiremainder) \ + : "0" (__arg1), "rm" (__arg2)); \ + __valuelo; \ +}) + +/* addmul(a,b): evaluates to the low word of a * b + hiremainder, using the + * value of hiremainder on entry; hiremainder then receives the high word of + * the product plus the carry from that addition (the adcq adds a zeroed + * scratch register). */ +#define addmul(a,b) \ +({ ulong __valuelo, __arg1 = (a), __arg2 = (b), __temp; \ + __asm__ ("mulq %4 ; addq %5,%0 ; adcq %6,%1" \ + : "=a" /* %rax */ (__valuelo), "=&d" /* %rdx, early-clobber */ (hiremainder), "=r" (__temp) \ + : "0" (__arg1), "rm" (__arg2), "g" (hiremainder), "2" ((ulong)0)); \ + __valuelo; \ +}) + +/* divll(a,b): divides the two-word value hiremainder:a by b with divq; + * evaluates to the quotient and leaves the remainder in hiremainder. + * NOTE(review): presumably callers ensure hiremainder < b so the quotient + * fits in one word -- confirm. */ +#define divll(a,b) \ +({ ulong __value, __arg1 = (a), __arg2 = (b); \ + __asm__ ("divq %4" \ + : "=a" /* %rax */ (__value), "=d" /* %rdx */ (hiremainder) \ + : "0" /* %rax */ (__arg1), "1" /* %rdx */ (hiremainder), "mr" (__arg2)); \ + __value; \ +}) + +/* bfffo(x): evaluates to 63 minus the bit index reported by bsrq for x. + * NOTE(review): presumably never called with x == 0, for which bsrq does + * not define its result -- confirm. */ +#define bfffo(x) \ +({ ulong __arg = (x); \ + long leading_one_position; \ + __asm__ ("bsrq %1,%0" : "=r" (leading_one_position) : "rm" (__arg)); \ + 63 - leading_one_position; \ +}) +#endif