| Jean-Pierre Flori on Thu, 14 Jan 2016 11:58:36 +0100 |
[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]
| Faster exponentiation in some extensions of finite fields of small characteristics |
Dear all, Here is a preliminary patch to speed up exponentiation in some extensions of finite fields of small characteristics (< size of machine word/2). It packs more coefficients of the polynomial representation over the prime field into machine words when the finite field element is mapped to a multiprecision integer (i.e. KS). Two functions are added: * one function which packs 4 coeffs into a machine word, * one generic funciton which packs the coeffs as much as possible possibly crossing machine words boundaries. I did not take care of adding new tuning parameters or smartly choosing between the different functions, e.g. calling the 4-coeffs-in-1-word function when the (product) coeff size is BITS_IN_LONG/4-\epsilon might be more efficient than using the generic function which does more complicated packing when the polynomial degree is not large. I did not add (yet) optimal packing when the product coeffs are larger than a machine word. I did not really check it is bug free. Best, -- Jean-Pierre Flori
diff --git a/src/basemath/Flx.c b/src/basemath/Flx.c
index 715cb02..dd984b3 100644
--- a/src/basemath/Flx.c
+++ b/src/basemath/Flx.c
@@ -594,15 +594,28 @@ Flx_shiftip(pari_sp av, GEN x, long v)
avma = (pari_sp)y; return y;
}
+/* Return either number of words needed to store coeffs minus 1
+or minus number of coeffs to be stored in a word plus 1. */
INLINE long
maxlengthcoeffpol(ulong p, long n)
{
pari_sp ltop = avma;
GEN z = muliu(sqru(p-1), n);
long l = lgefint(z);
+ long w, b;
avma = ltop;
- if (l==3 && HIGHWORD(z[2])==0) return 0;
- return l-2;
+ if (l==3)
+ {
+ w = z[2] >> 3;
+ b = 3;
+ while (w != 0)
+ {
+ w >>= 1;
+ b++;
+ }
+ return b-BITS_IN_LONG;
+ }
+ return l-3;
}
INLINE ulong
@@ -708,6 +721,135 @@ Flx_mulspec_halfmulii(GEN a, GEN b, ulong p, long na, long nb)
return int_to_Flx_half(z,p);
}
+static GEN
+Flx_to_int_bitpack_word(GEN x, long s, long l)
+{
+ long i;
+ long shift = 0;
+ long n = (l*s-1)/BITS_IN_LONG+1;
+ GEN V = cgetipos(2+n);
+ GEN w = int_LSW(V);
+ *w = 0;
+ for (i=0; i<l-1; i++)
+ {
+ *w |= x[i] << shift;
+ shift += s;
+ if (shift >= BITS_IN_LONG)
+ {
+ w=int_nextW(w);
+ shift -= BITS_IN_LONG;
+ if (shift != 0)
+ *w = x[i] >> (s-shift);
+ else
+ *w = 0;
+ }
+ }
+ {
+ *w |= x[i] << shift;
+ shift += s;
+ if (shift > BITS_IN_LONG)
+ {
+ w=int_nextW(w);
+ shift -= BITS_IN_LONG;
+ *w = x[i] >> (s-shift);
+ }
+ }
+ return V;
+}
+
+static GEN
+int_to_Flx_bitunpack_word(GEN x, long s, ulong p)
+{
+ long i;
+ long lz = ((lgefint(x)-2)*BITS_IN_LONG-1)/s+3;
+ long shift = 0;
+ long mask = (1UL<<s)-1UL;
+ GEN w, z = cgetg(lz, t_VECSMALL);
+ w = int_LSW(x);
+ for (i=2; i<lz-1; i++)
+ {
+ z[i] = (((ulong) *w) >> shift) & mask;
+ shift += s;
+ if (shift >= BITS_IN_LONG)
+ {
+ shift -= BITS_IN_LONG;
+ w = int_nextW(w);
+ if (shift != 0)
+ z[i] |= (((ulong) *w) << (s-shift)) & mask;
+ }
+ z[i] %= p;
+ }
+ {
+ z[i] = (((ulong) *w) >> shift) & mask;
+ z[i] %= p;
+ }
+ return Flx_renormalize(z, lz);
+}
+
+static GEN
+Flx_mulspec_mulii_bitpack(GEN x, GEN y, long s, ulong p, long nx, long ny)
+{
+ GEN z = mulii(Flx_to_int_bitpack_word(x,s,nx), Flx_to_int_bitpack_word(y,s,ny));
+ return int_to_Flx_bitunpack_word(z, s, p);
+}
+
+#define BITS_IN_QUARTULONG (BITS_IN_HALFULONG >> 1)
+static GEN
+Flx_to_int_quartspec(GEN a, long na)
+{
+ long j;
+ long n = (na+3)>>2UL;
+ GEN V = cgetipos(2+n);
+ GEN w;
+ for (w = int_LSW(V), j=0; j+3<na; j+=4, w=int_nextW(w))
+ *w = a[j]|(a[j+1]<<BITS_IN_QUARTULONG)|(a[j+2]<<(2*BITS_IN_QUARTULONG))|(a[j+3]<<(3*BITS_IN_QUARTULONG));
+ switch (na-j)
+ {
+ case 3:
+ *w = a[j]|(a[j+1]<<BITS_IN_QUARTULONG)|(a[j+2]<<(2*BITS_IN_QUARTULONG));
+ break;
+ case 2:
+ *w = a[j]|(a[j+1]<<BITS_IN_QUARTULONG);
+ break;
+ case 1:
+ *w = a[j];
+ break;
+ case 0:
+ break;
+ }
+ return V;
+}
+
+#define QUARTMASK ((1UL<<BITS_IN_QUARTULONG)-1UL)
+#define LLQUARTWORD(x) ((x) & QUARTMASK)
+#define HLQUARTWORD(x) (((x) >> BITS_IN_QUARTULONG) & QUARTMASK)
+#define LHQUARTWORD(x) (((x) >> (2*BITS_IN_QUARTULONG)) & QUARTMASK)
+#define HHQUARTWORD(x) (((x) >> (3*BITS_IN_QUARTULONG)) & QUARTMASK)
+static GEN
+int_to_Flx_quart(GEN z, ulong p)
+{
+ long i;
+ long lx = (lgefint(z)-2)*4+2;
+ GEN w, x = cgetg(lx, t_VECSMALL);
+ for (w = int_LSW(z), i=2; i<lx; i+=4, w=int_nextW(w))
+ {
+ x[i] = LLQUARTWORD((ulong)*w)%p;
+ x[i+1] = HLQUARTWORD((ulong)*w)%p;
+ x[i+2] = LHQUARTWORD((ulong)*w)%p;
+ x[i+3] = HHQUARTWORD((ulong)*w)%p;
+ }
+ return Flx_renormalize(x, lx);
+}
+
+static GEN
+Flx_mulspec_quartmulii(GEN a, GEN b, ulong p, long na, long nb)
+{
+ GEN A = Flx_to_int_quartspec(a,na);
+ GEN B = Flx_to_int_quartspec(b,nb);
+ GEN z = mulii(A,B);
+ return int_to_Flx_quart(z,p);
+}
+
/*Eval x in 2^(k*BIL) in linear time, k==2 or 3*/
static GEN
Flx_eval2BILspec(GEN x, long k, long l)
@@ -774,7 +916,7 @@ static GEN
Flx_mulspec(GEN a, GEN b, ulong p, long na, long nb)
{
GEN a0,c,c0;
- long n0, n0a, i, v = 0;
+ long n0, n0a, i, v = 0, s;
pari_sp av;
while (na && !a[0]) { a++; na--; v++; }
@@ -783,23 +925,20 @@ Flx_mulspec(GEN a, GEN b, ulong p, long na, long nb)
if (!nb) return pol0_Flx(0);
av = avma;
- switch (maxlengthcoeffpol(p,nb))
+ s = maxlengthcoeffpol(p,nb);
+ switch ((s > 0) - (s < 0))
{
- case 0:
+ case -1:
if (na>=Flx_MUL_HALFMULII_LIMIT)
- return Flx_shiftip(av,Flx_mulspec_halfmulii(a,b,p,na,nb), v);
+ return Flx_shiftip(av,Flx_mulspec_mulii_bitpack(a,b,BITS_IN_LONG+s,p,na,nb), v);
break;
- case 1:
+ case 0:
if (na>=Flx_MUL_MULII_LIMIT)
return Flx_shiftip(av,Flx_mulspec_mulii(a,b,p,na,nb), v);
break;
- case 2:
+ case 1:
if (na>=Flx_MUL_MULII2_LIMIT)
- return Flx_shiftip(av,Flx_mulspec_mulii_inflate(a,b,2,p,na,nb), v);
- break;
- case 3:
- if (na>70)
- return Flx_shiftip(av,Flx_mulspec_mulii_inflate(a,b,3,p,na,nb), v);
+ return Flx_shiftip(av,Flx_mulspec_mulii_inflate(a,b,1+s,p,na,nb), v);
break;
}
if (nb < Flx_MUL_KARATSUBA_LIMIT)
@@ -910,6 +1049,20 @@ Flx_sqrspec_halfsqri(GEN a, ulong p, long na)
}
static GEN
+Flx_sqrspec_sqri_bitpack(GEN x, long s, ulong p, long nx)
+{
+ GEN z = sqri(Flx_to_int_bitpack_word(x,s,nx));
+ return int_to_Flx_bitunpack_word(z, s, p);
+}
+
+static GEN
+Flx_sqrspec_quartsqri(GEN a, ulong p, long na)
+{
+ GEN z = sqri(Flx_to_int_quartspec(a,na));
+ return int_to_Flx_quart(z,p);
+}
+
+static GEN
Flx_sqrspec_sqri_inflate(GEN x, long N, ulong p, long nx)
{
pari_sp av = avma;
@@ -921,30 +1074,27 @@ static GEN
Flx_sqrspec(GEN a, ulong p, long na)
{
GEN a0, c, c0;
- long n0, n0a, i, v = 0;
+ long n0, n0a, i, v = 0, s;
pari_sp av;
while (na && !a[0]) { a++; na--; v += 2; }
if (!na) return pol0_Flx(0);
av = avma;
- switch(maxlengthcoeffpol(p,na))
+ s = maxlengthcoeffpol(p,na);
+ switch ((s > 0) - (s < 0))
{
- case 0:
+ case -1:
if (na>=Flx_SQR_HALFSQRI_LIMIT)
- return Flx_shiftip(av, Flx_sqrspec_halfsqri(a,p,na), v);
+ return Flx_shiftip(av, Flx_sqrspec_sqri_bitpack(a,BITS_IN_LONG+s,p,na), v);
break;
- case 1:
+ case 0:
if (na>=Flx_SQR_SQRI_LIMIT)
return Flx_shiftip(av, Flx_sqrspec_sqri(a,p,na), v);
break;
- case 2:
+ case 1:
if (na>=Flx_SQR_SQRI2_LIMIT)
- return Flx_shiftip(av, Flx_sqrspec_sqri_inflate(a,2,p,na), v);
- break;
- case 3:
- if (na>70)
- return Flx_shiftip(av, Flx_sqrspec_sqri_inflate(a,3,p,na), v);
+ return Flx_shiftip(av, Flx_sqrspec_sqri_inflate(a,1+s,p,na), v);
break;
}
if (na < Flx_SQR_KARATSUBA_LIMIT)
@@ -1031,24 +1181,21 @@ static long
Flx_multhreshold(GEN T, ulong p, long half, long mul, long mul2, long kara)
{
long na = lgpol(T);
- switch (maxlengthcoeffpol(p,na))
+ long s = maxlengthcoeffpol(p,na);
+ switch ((s > 0) - (s < 0))
{
- case 0:
+ case -1:
if (na>=Flx_MUL_HALFMULII_LIMIT)
return na>=half;
break;
- case 1:
+ case 0:
if (na>=Flx_MUL_MULII_LIMIT)
return na>=mul;
break;
- case 2:
+ case 1:
if (na>=Flx_MUL_MULII2_LIMIT)
return na>=mul2;
break;
- case 3:
- if (na>=70)
- return na>=70;
- break;
}
return na>=kara;
}