Ilya Zakharevich on Thu, 12 Dec 2002 15:20:20 -0800


[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]

[PATCH oldish CVS] primelimit above 1<<31


Actually, the bug in initprimes0() has a one-line fix, but this patch
also fixes other places where the limit was signed, and adds some minimal docs.

Some minimal cache optimizations are also performed.  Now sieving up
to 4e9 takes less than 2min on Athlon/850 or Ultra Sparc/333.

Enjoy,
Ilya

--- ./src/basemath/arith2.c-pre	Wed Oct 23 14:58:22 2002
+++ ./src/basemath/arith2.c	Thu Dec 12 14:52:30 2002
@@ -116,7 +116,145 @@ initprimes1(ulong size, long *lenp, long
   return (byteptr) gprealloc(p,r-p);
 }
 
-#define PRIME_ARENA (200 * 1024) /* No slowdown even with 256K level-2 cache */
+/*  Timing in ms (Athlon/850; reports 512M of secondary cache; looks
+    like there is 64K of quickier cache too).
+
+    The algorithm of allocation starts to work regularly from
+    2pi(sqrt(lim)); we skip or double-quote what is less than this.
+
+
+     arena| 10m   30m   100m   300m   1000m   2000m    4000m
+     =================================================================
+	1K  360
+       1.5  250  1150
+	2K  210   840
+	3K  180   660   3160
+	4K  160   580   2530  11930  "36680" "84730" "262850"
+       12K  150   460   1730   5590   26010   65500  "184520"
+       20K  150   430   1500   4900   25450   56420   143220
+     ...
+       64K  140   410   1400   4360   22150   32850   122820
+
+    Timing relative to 64K:
+
+     20K    1.071  1.049  1.071  1.124  1.149  1.718  1.166
+     28K    1.071  1.000  1.043  1.067  1.115  1.256  1.326
+     36K    1.000  1.024  1.029  1.034  0.946  1.075  1.087
+     44K    1.000  1.049  1.007  1.016  1.006  1.056  0.974
+     52K    1.000  0.976  1.007  1.009  1.023  1.035  0.910
+     56K    0.929  1.024  1.000  1.002  1.023  1.022  0.893
+     58K    1.143  0.976  1.000  1.002  0.995  1.016  0.888
+     60K    1.000  1.024  1.000  0.998  1.016  1.010  0.865
+     62K    1.000  1.000  1.000  0.995  1.023  1.004  0.922
+     64K    1.000  1.000  1.000  1.000  1.000  1.000  1.000
+     66K    1.071  1.049  1.014  0.993  1.025  0.995  0.938
+     68K    1.071  1.098  1.086  1.050  0.939  0.991  0.926
+     70K    1.214  1.171  1.164  1.124  1.061  1.023  0.860
+     72K    1.214  1.244  1.221  1.181  1.070  1.081  0.852
+     76K    1.357  1.415  1.371  1.305  1.154  1.260  0.841
+     84K    1.643  1.610  1.614  1.546  1.340  1.426  0.954
+     92K    1.857  1.878  1.850  1.773  1.475  1.640  1.063
+     96K    2.000  2.073  2.057  2.016  1.380  1.710  1.127
+    128K    2.214  2.341  2.371  2.328  1.624  2.118  1.429
+    192K    2.500  2.707  2.743  2.729  1.901  2.495  1.626
+    256K    2.786  2.902  2.979  2.977  2.077  2.680  1.791
+    384K    3.143  3.293  3.371  3.392  2.344  3.050  1.906
+    512K    3.286  3.488  3.521  3.537  2.445  3.441  2.130
+    768K    3.857  4.171  4.264  4.319  2.977  4.045  2.490
+   1024K    4.429  4.707  4.879  4.947  3.435  4.900  2.941
+   1536K    5.071  5.659  6.029  6.197  4.393  6.237  3.662
+   2048K    5.357  6.195  6.593  6.995  4.857  6.886  4.039
+   3072K    5.786  6.488  7.029  7.356  5.264  7.425  4.402
+
+    Values after 92K are from different run...  Matters much for 4000m
+    one, when swapping starts; also 1000m run looks not very much
+    reproducible...
+
+    The stats for small arena lead to the value of ARENA_IN_ROOTS for i386.
+
+    Similar data for Sparc leads to 10.
+
+    TODO: One needs to create macro for (small) OVERHEAD_100_IN_ROOTS where the
+    overhead of subdivision is 100%; likewise one needs to estimate
+    the overhead of having the arena larger than the cache size.  Then
+    one can intelligently optimize the arena size taking both effects
+    into the account...
+*/
+
+#ifndef ARENA_IN_ROOTS
+#  ifdef i386		/* gcc defines this? */
+#    define ARENA_IN_ROOTS	1.5
+#  else
+#    define ARENA_IN_ROOTS	10
+#  endif
+#endif
+#ifndef PRIME_ARENA
+#  ifdef i386		/* gcc defines this? */
+   /* Due to smaller ARENA_IN_ROOTS, smaller arena is OK; fit level-1 cache */
+#    define PRIME_ARENA (63 * 1024) /* No slowdown even with 64K level-1 cache */
+#  else
+#    define PRIME_ARENA (200 * 1024) /* No slowdown even with 256K level-2 cache */
+#  endif
+#endif
+
+static long prime_arena = PRIME_ARENA;
+static double arena_in_roots = ARENA_IN_ROOTS;
+
+long
+good_arena_size(long rootnum, ulong remains, ulong primes)
+{
+  /* ARENA_IN_ROOTS below 12: some slowdown starts to be noticable
+   * when things fit into the cache.
+   * XXX The choice of 10 gives a slowdown of 1-2% on UltraSparcII,
+   * but makes calculations even for (the current) maximum of 436273009
+   * fit into 256K cache (still common for some architectures).
+   *
+   * One may change it when small caches become uncommon, but the gain
+   * is not going to be very noticable... */
+
+  long asize = arena_in_roots * rootnum;	/* Make % overhead negligeable. */
+  if (asize < prime_arena) asize = prime_arena - 1;
+  if (asize > remains) asize = remains;	/* + room for a sentinel byte */
+  if (2 * primes < asize)		/* XXXX Better substitutes for 2? */
+    asize -= primes;			/* We access arena AND primes */
+  return asize;
+}
+
+/* Use as in
+    install(set_internal,lLDG)		\\ Through some M too?
+    set_internal(2,100)	\\ disable dependence on limit
+
+    { time_primes_arena(ar,limit) =
+	set_internal(1,floor(ar*1024));
+	default(primelimit, 200 000);	\\ 100000 results in *larger* malloc()!
+	gettime;
+	default(primelimit, limit);
+	if(ar >= 1, ar=floor(ar));
+	print("arena "ar"K => "gettime"ms");
+    }
+*/
+
+long
+set_internal(long what, GEN g)
+{
+  long ret = 0;
+
+  if (what == 1)
+    ret = prime_arena;
+  else if (what == 2)
+    ret = arena_in_roots * 1000;
+  else
+    err(talker, "panic: set_internal");
+  if (g != NULL) {
+    long val = itos(g);
+
+    if (what == 1)
+      prime_arena = val;
+    else if (what == 2)
+      arena_in_roots = val / 1000.;
+  }
+  return ret;
+}
 
 /* Here's the workhorse.  This is recursive, although normally the first
    recursive call will bottom out and invoke initprimes1() at once.
@@ -151,26 +289,15 @@ initprimes0(ulong maxnum, long *lenp, ul
   fin1 = p1 + psize - 1;
   remains = (maxnum - rootnum) >> 1; /* number of odd numbers to check */
 
-  /* ARENA_IN_ROOTS below 12: some slowdown starts to be noticable
-   * when things fit into the cache.
-   * XXX The choice of 10 gives a slowdown of 1-2% on UltraSparcII,
-   * but makes calculations even for (the current) maximum of 436273009
-   * fit into 256K cache (still common for some architectures).
-   *
-   * One may change it when small caches become uncommon, but the gain
-   * is not going to be very noticable... */
-#define ARENA_IN_ROOTS	10
-
-  asize = ARENA_IN_ROOTS * rootnum;	/* Make % overhead negligeable. */
-  if (asize < PRIME_ARENA) asize = PRIME_ARENA;
-  if (asize > remains) asize = remains + 1;/* + room for a sentinel byte */
-  alloced = (avma - bot < (size_t)asize>>1); /* enough room on the stack ? */
+  asize = good_arena_size(rootnum, remains, psize);
+  /* enough room on the stack ? */
+  alloced = (((byteptr)avma) - ((byteptr)bot) <= (size_t)asize);
   if (alloced)
-    p = (byteptr) gpmalloc(asize);
+    p = (byteptr) gpmalloc(asize + 1);
   else
     p = (byteptr) bot;
-  fin = p + asize - 1;		/* the 0 sentinel goes at fin. */
-  curlow = rootnum + 2;		/* We know all primes up to rootnum (odd). */
+  fin = p + asize;		/* the 0 sentinel goes at fin. */
+  curlow = rootnum + 2;	/* First candidate: know primes up to rootnum (odd). */
   curdiff = fin1;
 
   /* During each iteration p..fin-1 represents a range of odd
@@ -181,10 +308,10 @@ initprimes0(ulong maxnum, long *lenp, ul
   {
     if (asize > remains)
     {
-      asize = remains + 1;
-      fin = p + asize - 1;
+      asize = remains;
+      fin = p + asize;
     }
-    memset(p, 0, asize);
+    memset(p, 0, asize + 1);
     /* p corresponds to curlow.  q runs over primediffs.  */
     /* Don't care about DIFFPTR_SKIP: false positives provide no problem */
     for (q = p1+2, k = 3; q <= fin1; k += *q++)
@@ -201,12 +328,29 @@ initprimes0(ulong maxnum, long *lenp, ul
        */
       long k2 = k*k - curlow;
 
-      if (k2 > 0)
-      {
-	r = p + (k2 >>= 1);
-	if (k2 > remains) break; /* Guard against an address wrap. */
-      } else
+#if 0					/* XXXX Check which one is quickier! */
+      if (k2 > 0) {			/* May be due to ulong==>long wrap */
+	k2 >>= 1;
+	if (k2 >= asize) {
+	  if (k2 > remains) {
+	    /* Can happen due to a conversion wrap only, so . */
+	    goto small_k;
+          } else
+	    break;			/* All primes up to sqrt(top) checked */
+	}
+	r = p + k2;
+      } else {
+       small_k:
 	r = p - (((curlow+k-2) % (2*k)) >> 1) + k - 1;
+      }
+#else
+      if (k2 > 0) {
+	r = p + (k2 >>= 1);
+	if (k2 <= remains) goto finish; /* Guard against an address wrap. */
+      }
+      r = p - (((curlow+k-2) % (2*k)) >> 1) + k - 1;
+    finish:
+#endif
       for (s = r; s < fin; s += k) *s = 1;
     }
     /* now q runs over addresses corresponding to primes */
@@ -221,9 +365,9 @@ initprimes0(ulong maxnum, long *lenp, ul
 	*curdiff++ = DIFFPTR_SKIP, d -= DIFFPTR_SKIP;
       *curdiff++ = d;
     }
-    plast -= asize - 1;
-    remains -= asize - 1;
-    curlow += ((asize - 1)<<1);
+    plast -= asize;
+    remains -= asize;
+    curlow += (asize<<1);
   } /* while (remains) */
   last = curlow - ((p - plast) << 1);
   *curdiff++ = 0;		/* sentinel */
--- ./src/headers/paricom.h-pre	Thu Oct 31 00:54:14 2002
+++ ./src/headers/paricom.h	Tue Nov 12 22:01:04 2002
@@ -287,6 +287,8 @@ extern int new_galois_format;
 #define mod4(x)   (modBIL(x) & 3)
 #define mod2(x)   (modBIL(x) & 1)
 #define is_pm1(n)    ((lgefint(n)==3) && (((GEN)(n))[2]==1))
+#define is_biguint(n) ((lgefint(n)>3) || \
+		       ((lgefint(n)==3) && signe(n) < 0))
 #define is_bigint(n) ((lgefint(n)>3) || \
 		      ((lgefint(n)==3) && ((((GEN)(n))[2]) < 0)))
 
--- ./src/language/sumiter.c-pre	Tue Oct 15 17:34:06 2002
+++ ./src/language/sumiter.c	Tue Nov 12 22:00:08 2002
@@ -99,8 +99,8 @@ forstep(entree *ep, GEN a, GEN b, GEN s,
 /* assume ptr is the address of a diffptr containing the succesive
  * differences between primes, and c = current prime (up to *p excluded)
  * return smallest prime >= a, update ptr */
-static long
-sinitp(long a, long c, byteptr *ptr)
+static ulong
+sinitp(ulong a, ulong c, byteptr *ptr)
 {
   byteptr p = *ptr;
   if (a <= 0) a = 2;
@@ -113,14 +113,14 @@ sinitp(long a, long c, byteptr *ptr)
 /* value changed during the loop, replace by the first prime whose
    value is strictly larger than new value */
 static void
-update_p(entree *ep, byteptr *ptr, long prime[])
+update_p(entree *ep, byteptr *ptr, ulong prime[])
 {
   GEN z = (GEN)ep->value;
-  long a, c;
+  ulong a, c;
 
   if (typ(z) == t_INT) a = 1; else { z = gceil(z); a = 0; }
-  if (is_bigint(z)) { prime[2] = -1; /* = infinity */ return; }
-  a += itos(z); c = prime[2];
+  if (is_biguint(z)) { prime[2] = -1; /* = infinity */ return; }
+  a += z[2]; c = prime[2];
   if (c < a)
     prime[2] = sinitp(a, c, ptr); /* increased */
   else if (c > a)
@@ -128,25 +128,29 @@ update_p(entree *ep, byteptr *ptr, long 
     *ptr = diffptr;
     prime[2] = sinitp(a, 0, ptr);
   }
-  changevalue_p(ep, prime);
+  changevalue_p(ep, (GEN)prime);
 }
 
 static byteptr
-prime_loop_init(GEN ga, GEN gb, long *a, long *b, long prime[])
+prime_loop_init(GEN ga, GEN gb, ulong *a, ulong *b, ulong prime[])
 {
   byteptr d = diffptr;
 
   ga = gceil(ga); gb = gfloor(gb);
   if (typ(ga) != t_INT || typ(gb) != t_INT)
     err(typeer,"prime_loop_init");
-  if (is_bigint(ga) || is_bigint(gb))
+  if (signe(gb) < 0)
+    return NULL;
+  if (signe(ga) < 0)
+    ga = gun;
+  if (is_biguint(ga) || is_biguint(gb))
   {
     if (cmpii(ga, gb) > 0) return NULL;
     err(primer1);
   }
-  *a = itos(ga); if (*a <= 0) *a = 1;
-  *b = itos(gb); if (*a > *b) return NULL;
-  maxprime_check((ulong)*b);
+  *a = (ulong)ga[2];
+  *b = (ulong)gb[2]; if (*a > *b) return NULL;
+  maxprime_check(*b);
   prime[2] = sinitp(*a, 0, &d);
   return d;
 }
@@ -154,8 +158,9 @@ prime_loop_init(GEN ga, GEN gb, long *a,
 void
 forprime(entree *ep, GEN ga, GEN gb, char *ch)
 {
-  long prime[] = {evaltyp(t_INT)|_evallg(3), evalsigne(1)|evallgefint(3), 0};
-  long a, b;
+  long p[] = {evaltyp(t_INT)|_evallg(3), evalsigne(1)|evallgefint(3), 0};
+  ulong *prime = (ulong*)p;
+  ulong a, b;
   pari_sp av = avma;
   byteptr d;
 
@@ -163,7 +168,7 @@ forprime(entree *ep, GEN ga, GEN gb, cha
   if (!d) { avma = av; return; }
 
   avma = av; push_val(ep, prime);
-  while ((ulong)prime[2] < (ulong)b)
+  while (prime[2] < b)
   {
     (void)lisseq(ch); if (loop_break()) break;
     if (ep->value == prime)
--- ./src/gp/gp.c-pre	Wed Oct 23 14:58:24 2002
+++ ./src/gp/gp.c	Sat Dec  7 00:22:24 2002
@@ -423,7 +423,7 @@ sd_ulong(char *v, int flag, char *s, ulo
   if (*v == 0) n = *ptn;
   else
   {
-    n = (ulong)get_int(v,0);
+    n = get_uint(v);
     if (*ptn == n) return gnil;
     if (n > Max || n < Min)
     {
@@ -766,7 +766,7 @@ static GEN
 sd_primelimit(char *v, int flag)
 {
   ulong n = primelimit;
-  GEN r = sd_ulong(v,flag,"primelimit",&n, 0,VERYBIGINT,NULL);
+  GEN r = sd_ulong(v,flag,"primelimit",&n, 0,2*(ulong)VERYBIGINT,NULL);
   if (n != primelimit)
   {
     if (flag != d_INITRC)