Bill Allombert on Tue, 20 Jan 2004 00:55:33 +0100


[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]

patch for Montgomery-style reduction in Flxq_pow


Hello Pari-dev,

Here a new patch that implement Montgomery-style reduction
for Flxq_pow.

This patch ned more tuning, and the inversion function would deserve
to not be quadratic.

Here the benchmark used:

---
install(Flx_invmontgomery,GL);
install(Flxq_pow,GGGL);
install(FpXQ_pow,GGGG);
install(Flx_redmontgomery,GGGL);
install(Flx_rem,GGL);
install(ZX_Flx,GL);
montgo1(P,p)=Flx_ZX(Flx_invmontgomery(ZX_Flx(P,p),p))
montgo2(P,p)=variable(P)/polrecip(P*Mod(1,p))+O(x^poldegree(P))
montgo(P,p)=montgo1(P,p)==montgo2(P,p)
{
  print("i\t:\ttmg\t:\tmon\t:\trem");
  for(k=1,15,e=2^k;
      P=Pol(vector(e,i,random(17)));
      Pv=ZX_Flx(P,17);
      gettime();
      mg=Flx_invmontgomery(Pv,17);
      tmg=gettime();
      Xv=ZX_Flx(x^(2*e-2),17);
      gettime();
      for(i=1,2^(16-k),Flx_rem(Xv,Pv,17));
      trem=gettime();
      for(i=1,2^(16-k),Flx_redmontgomery(Xv,mg,Pv,17));
      tmon=gettime();
      print(k,"\t:\t",tmg,"\t:\t",tmon,"\t:\t",trem);
  )
}
-----

i	:	tmg	:	mon	:	rem
1	:	0	:	50	:	40
2	:	0	:	30	:	30
3	:	0	:	20	:	20
4	:	0	:	20	:	10
5	:	0	:	30	:	20
6	:	0	:	30	:	30
7	:	0	:	50	:	50
8	:	0	:	70	:	90
9	:	0	:	100	:	170
10	:	20	:	150	:	340
11	:	80	:	220	:	670
12	:	310	:	330	:	1320
13	:	1220	:	510	:	2650
14	:	4920	:	760	:	5520
15	:	19750	:	1170	:	11430

(this is with the PARI kernel, GMP would be faster)

Cheers,
Bill.

Index: src/basemath/Flx.c
===================================================================
RCS file: /home/cvs/pari/src/basemath/Flx.c,v
retrieving revision 1.12
diff -u -r1.12 Flx.c
--- src/basemath/Flx.c	17 Jan 2004 18:47:50 -0000	1.12
+++ src/basemath/Flx.c	19 Jan 2004 23:12:08 -0000
@@ -233,6 +233,18 @@
   return Flx_renormalize(x,l);
 }
 
+/*UNCLEAN modify y[2] (FpX_Fp_add do the same)*/
+GEN
+Flx_Fl_add(GEN y,ulong x,ulong p)
+{
+  long v=varn(y);
+  if (lg(y)==2)
+    return Fl_Flx(x,v);
+  y[2] = adduumod((ulong)y[2],x,p);
+  if (y[2]==0 && degpol(y) == 0) return Flx_zero(v);
+  return y;
+}
+
 GEN
 Flx_addspec(GEN x, GEN y, ulong p, long lx, long ly)
 {
@@ -261,6 +273,27 @@
 }
 
 GEN
+Flx_subspec(GEN x, GEN y, ulong p, long lx, long ly)
+{
+  long i,lz;
+  GEN z;
+
+  if (ly <= lx)
+  {
+    lz = lx+2; z = cgetg(lz, t_VECSMALL)+2;
+    for (i=0; i<ly; i++) z[i] = (long)subuumod((ulong)x[i],(ulong)y[i],p);
+    for (   ; i<lx; i++) z[i] = x[i];
+  }
+  else
+  {
+    lz = ly+2; z = cgetg(lz, t_VECSMALL)+2;
+    for (i=0; i<lx; i++) z[i] = (long)subuumod((ulong)x[i],(ulong)y[i],p);
+    for (   ; i<ly; i++) z[i] = y[i]? (long)(p - y[i]): y[i];
+  }
+ return Flx_renormalize(z-2, lz);
+}
+
+GEN
 Flx_sub(GEN x, GEN y, ulong p)
 {
   long i,lz,lx = lg(x), ly = lg(y);
@@ -282,12 +315,21 @@
 }
 
 GEN
+Flx_negspec(GEN x, ulong p, long l)
+{
+  long i;
+  GEN z = cgetg(l+2, t_VECSMALL) + 2;
+  for (i=0; i<l; i++) 
+    z[i] = x[i]? (long)p - x[i]: 0;
+  return z-2;
+}
+
+
+GEN
 Flx_neg(GEN x, ulong p)
 {
-  long i, l = lg(x);
-  GEN z = cgetg(l, t_VECSMALL);
+  GEN z = Flx_negspec(x+2, p, lgpol(x));
   z[1] = x[1];
-  for (i=2; i<l; i++) z[i] = x[i]? (long)p - x[i]: 0;
   return z;
 }
 
@@ -426,7 +468,7 @@
     for (  ; i<nx; i++) z[i] = (long)Flx_mullimb(x+i,y,p,0,ny);
     for (  ; i<nz; i++) z[i] = (long)Flx_mullimb(x+i,y,p,i-nx+1,ny);
   }
-  z -= 2; return Flx_renormalize(z, lz);
+  z -= 2; return z; 
 }
 
 INLINE GEN
@@ -541,7 +583,7 @@
       z[i] = (long)p1;
     }
   }
-  z -= 2; return Flx_renormalize(z,lz);
+  z -= 2; return z;
 }
 
 GEN
@@ -780,6 +822,80 @@
 }
 
 GEN
+Flx_recipspec(GEN x, long l, long n)
+{
+  long i;
+  GEN z=cgetg(n+2,t_VECSMALL)+2;
+  for(i=0; i<l; i++)
+    z[n-i-1] = x[i];
+  for(   ; i<n; i++)
+    z[n-i-1] = 0;
+  return Flx_renormalize(z-2,n+2);
+}
+
+GEN
+Flx_recip(GEN x)
+{
+  GEN z=Flx_recipspec(x+2,lgpol(x),lgpol(x));
+  z[1]=x[1];
+  return z;
+}
+
+/*
+ * x/polrecip(P)+O(x^n)
+ */
+GEN Flx_invmontgomery(GEN T, ulong p)
+{
+  pari_sp ltop=avma;
+  long i, l=lg(T), k;
+  long vt=T[1];
+  GEN r;
+  ulong c=T[l-1], ci=1;
+  if (l<5) return Flx_zero(vt);
+  if (c!=1)
+  {
+    ci=invumod(c,p);
+    T=Flx_Fl_mul(T, ci, p);
+  }
+  r=cgetg(l-1,t_VECSMALL); r[1]=vt;
+  r[2]=0; r[3]=1;
+  for (i=4;i<l-1;i++)
+  {
+    r[i] = 0;
+    for (k=3;k<i;k++)
+      r[i]= (long) subuumod((ulong)r[i],muluumod((ulong)T[l-i+k-1],(ulong)r[k],p),p);
+  }
+  r = Flx_renormalize(r,l-1);
+  if (c!=1)  r=Flx_Fl_mul(r,ci,p);
+  return gerepileupto(ltop, r);
+}
+
+/* Compute x mod T where lg(x)<=2*lg(T)-2
+ * and mg is the Montgomery inverse of T. 
+ */
+GEN Flx_redmontgomery(GEN x, GEN mg, GEN T, ulong p)
+{
+  pari_sp ltop=avma;
+  GEN z;
+  long l=lgpol(x);
+  long lt=degpol(T); /*We discard the leading term*/
+  long lead=lt-1;
+  long ld=l-lt+1;
+  long lm=min(ld,lgpol(mg));
+  if (l<=lt)
+    return Flx_copy(x);
+  new_chunk(lt);
+  z=Flx_recipspec(x+2+lead,ld,ld);              /* z = rec(x)*/
+  z=Flx_mulspec(z+2,mg+2,p,min(ld,lgpol(z)),lm);/* z = rec(x) * mg */
+  z=Flx_recipspec(z+2,lgpol(z),ld);             /* z = rec (rec(x) * mg) */
+  z=Flx_mulspec(z+2,T+2,p,min(ld,lgpol(z)),lt); /* z *= pol */
+  avma=ltop;
+  z=Flx_subspec(x+2,z+2,p,lt,min(lt,lgpol(z))); /* z = x - z */
+  z[1]=T[1];
+  return z;
+}
+
+GEN
 Flx_deriv(GEN z, ulong p)
 {
   long i,l = lg(z)-1;
@@ -1098,6 +1214,7 @@
 
 typedef struct {
   GEN pol;
+  GEN mg;
   ulong p;
 } Flxq_muldata;
 
@@ -1106,13 +1223,13 @@
 _u_sqr(void *data, GEN x)
 {
   Flxq_muldata *D = (Flxq_muldata*)data;
-  return Flxq_sqr(x, D->pol, D->p);
+  return Flx_redmontgomery(Flx_sqr(x,D->p),D->mg, D->pol, D->p);
 }
 static GEN
 _u_mul(void *data, GEN x, GEN y)
 {
   Flxq_muldata *D = (Flxq_muldata*)data;
-  return Flxq_mul(x,y, D->pol, D->p);
+  return Flx_redmontgomery(Flx_mul(x,y,D->p),D->mg, D->pol, D->p);
 }
 
 /* n-Power of x in Z/pZ[X]/(pol), as t_VECSMALL. */
@@ -1125,6 +1242,8 @@
   GEN y;
   D.pol = pol;
   D.p   = p;
+  D.mg  = Flx_invmontgomery(pol,p);
+  x=Flx_rem(x, pol, p);
   y = leftright_pow(x, n, (void*)&D, &_u_sqr, &_u_mul);
   return gerepileupto(av, y);
 }