| Peter Bruin on Thu, 07 May 2015 12:09:55 +0200 |
[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]
| Re: Strassen multiplication over the integers |
Hello Bill, [Sorry if there is any double post; I tried to send a reply before, but it seems to have disappeared.] >> > - ideally the program src/test/tune.c should deal with the tuning. >> > This program deal with dependencies between tuning parameters. >> >> OK, I hope to have some time soon to try to write an addition for >> src/test/tune.c for this. > > Do not waste too much time with tune.c. The most important is to find > out how the tuning is affected by the coefficient size. (I have not yet done this, but hope to look at it soon.) > By the way, what matrix size your project is using ? In a currently reasonable case (computations with a curve of genus 22 over a finite field), one has to do matrix multiplications up to size (361 x 69) * (69 x 67). In a harder but possibly reachable case (a curve of genus 40), all sizes will be about twice as large. >> > - Most of the code is fairly generic, so maybe there could be a >> > gen_matmul_sw function. I have now implemented this; see the attached patch. It is completely analogous to ZM_mul_sw. There are some new tests to ensure that the new code is covered. >> Indeed, and I actually already have an implementation of exactly this >> function, but it is an older version with awkward conventions for the >> indices. I will try to update this soon. (Unfortunately it will be >> hard to tune that one...) > > Well, we might let the caller provide the tuning parameter. I did not see a clearly best way to implement this; the tuning parameter is currently the fixed value 24. This seems to be a good value in the cases I tested (matrices over a field of size p^2, where p = 2^80 + 13). At some point, we could implement functions F2xqM_mul_Kronecker and FpXQM_mul_Kronecker, which would mean that we only need Strassen multiplication (and the tuning that comes with it) for ZM_mul and possibly Flm_mul. Thanks, Peter
commit 6615a44670102192c1c5818cd9d44bd01de54798
Author: Peter Bruin <P.J.Bruin@math.leidenuniv.nl>
Date: Sat May 2 01:08:09 2015 +0200
implement gen_matmul_sw
diff --git a/src/basemath/alglin1.c b/src/basemath/alglin1.c
index 646a8e9..c354596 100644
--- a/src/basemath/alglin1.c
+++ b/src/basemath/alglin1.c
@@ -389,11 +389,212 @@ gen_matcolmul(GEN A, GEN B, void *E, const struct bb_field *ff)
return gen_matcolmul_i(A, B, lgA, lgcols(A), E, ff);
}
+static GEN
+gen_matmul_classical(GEN A, GEN B, long l, long la, long lb,
+ void *E, const struct bb_field *ff)
+{
+ long j;
+ GEN C = cgetg(lb, t_MAT);
+ for(j = 1; j < lb; j++)
+ gel(C, j) = gen_matcolmul_i(A, gel(B, j), la, l, E, ff);
+ return C;
+}
+
+/* Strassen-Winograd algorithm */
+
+/*
+ Return A[ma+1..ma+da, na+1..na+ea] - B[mb+1..mb+db, nb+1..nb+eb]
+ as an (m x n)-matrix, padding the input with zeroes as necessary.
+*/
+static GEN
+add_slices(long m, long n,
+ GEN A, long ma, long da, long na, long ea,
+ GEN B, long mb, long db, long nb, long eb,
+ void *E, const struct bb_field *ff)
+{
+ long min_d = minss(da, db), min_e = minss(ea, eb), i, j;
+ GEN M = cgetg(n + 1, t_MAT), C;
+
+ for (j = 1; j <= min_e; j++) {
+ gel(M, j) = C = cgetg(m + 1, t_COL);
+ for (i = 1; i <= min_d; i++)
+ gel(C, i) = ff->add(E, gcoeff(A, ma + i, na + j),
+ gcoeff(B, mb + i, nb + j));
+ for (; i <= da; i++)
+ gel(C, i) = gcoeff(A, ma + i, na + j);
+ for (; i <= db; i++)
+ gel(C, i) = gcoeff(B, mb + i, nb + j);
+ for (; i <= m; i++)
+ gel(C, i) = ff->s(E, 0);
+ }
+ for (; j <= ea; j++) {
+ gel(M, j) = C = cgetg(m + 1, t_COL);
+ for (i = 1; i <= da; i++)
+ gel(C, i) = gcoeff(A, ma + i, na + j);
+ for (; i <= m; i++)
+ gel(C, i) = ff->s(E, 0);
+ }
+ for (; j <= eb; j++) {
+ gel(M, j) = C = cgetg(m + 1, t_COL);
+ for (i = 1; i <= db; i++)
+ gel(C, i) = gcoeff(B, mb + i, nb + j);
+ for (; i <= m; i++)
+ gel(C, i) = ff->s(E, 0);
+ }
+ for (; j <= n; j++) {
+ gel(M, j) = C = cgetg(m + 1, t_COL);
+ for (i = 1; i <= m; i++)
+ gel(C, i) = ff->s(E, 0);
+ }
+ return M;
+}
+
+/*
+ Return A[ma+1..ma+da, na+1..na+ea] - B[mb+1..mb+db, nb+1..nb+eb]
+ as an (m x n)-matrix, padding the input with zeroes as necessary.
+*/
+static GEN
+subtract_slices(long m, long n,
+ GEN A, long ma, long da, long na, long ea,
+ GEN B, long mb, long db, long nb, long eb,
+ void *E, const struct bb_field *ff)
+{
+ long min_d = minss(da, db), min_e = minss(ea, eb), i, j;
+ GEN M = cgetg(n + 1, t_MAT), C;
+
+ for (j = 1; j <= min_e; j++) {
+ gel(M, j) = C = cgetg(m + 1, t_COL);
+ for (i = 1; i <= min_d; i++)
+ gel(C, i) = ff->add(E, gcoeff(A, ma + i, na + j),
+ ff->neg(E, gcoeff(B, mb + i, nb + j)));
+ for (; i <= da; i++)
+ gel(C, i) = gcoeff(A, ma + i, na + j);
+ for (; i <= db; i++)
+ gel(C, i) = ff->neg(E, gcoeff(B, mb + i, nb + j));
+ for (; i <= m; i++)
+ gel(C, i) = ff->s(E, 0);
+ }
+ for (; j <= ea; j++) {
+ gel(M, j) = C = cgetg(m + 1, t_COL);
+ for (i = 1; i <= da; i++)
+ gel(C, i) = gcoeff(A, ma + i, na + j);
+ for (; i <= m; i++)
+ gel(C, i) = ff->s(E, 0);
+ }
+ for (; j <= eb; j++) {
+ gel(M, j) = C = cgetg(m + 1, t_COL);
+ for (i = 1; i <= db; i++)
+ gel(C, i) = ff->neg(E, gcoeff(B, mb + i, nb + j));
+ for (; i <= m; i++)
+ gel(C, i) = ff->s(E, 0);
+ }
+ for (; j <= n; j++) {
+ gel(M, j) = C = cgetg(m + 1, t_COL);
+ for (i = 1; i <= m; i++)
+ gel(C, i) = ff->s(E, 0);
+ }
+ return M;
+}
+
+static GEN gen_matmul_i(GEN A, GEN B, long l, long la, long lb,
+ void *E, const struct bb_field *ff);
+
+static GEN
+gen_matmul_sw(GEN A, GEN B, long m, long n, long p,
+ void *E, const struct bb_field *ff)
+{
+ pari_sp av = avma;
+ long m1 = (m + 1)/2, m2 = m/2,
+ n1 = (n + 1)/2, n2 = n/2,
+ p1 = (p + 1)/2, p2 = p/2;
+ GEN A11, A12, A22, B11, B21, B22,
+ S1, S2, S3, S4, T1, T2, T3, T4,
+ M1, M2, M3, M4, M5, M6, M7,
+ V1, V2, V3, C11, C12, C21, C22, C;
+
+ T2 = subtract_slices(n1, p2, B, 0, n1, p1, p2, B, n1, n2, p1, p2, E, ff);
+ S1 = subtract_slices(m2, n1, A, m1, m2, 0, n1, A, 0, m2, 0, n1, E, ff);
+ M2 = gen_matmul_i(S1, T2, m2 + 1, n1 + 1, p2 + 1, E, ff);
+ if (gc_needed(av, 1))
+ gerepileall(av, 2, &T2, &M2); /* destroy S1 */
+ T3 = subtract_slices(n1, p1, T2, 0, n1, 0, p2, B, 0, n1, 0, p1, E, ff);
+ if (gc_needed(av, 1))
+ gerepileall(av, 2, &M2, &T3); /* destroy T2 */
+ S2 = add_slices(m2, n1, A, m1, m2, 0, n1, A, m1, m2, n1, n2, E, ff);
+ T1 = subtract_slices(n1, p1, B, 0, n1, p1, p2, B, 0, n1, 0, p2, E, ff);
+ M3 = gen_matmul_i(S2, T1, m2 + 1, n1 + 1, p2 + 1, E, ff);
+ if (gc_needed(av, 1))
+ gerepileall(av, 4, &M2, &T3, &S2, &M3); /* destroy T1 */
+ S3 = subtract_slices(m1, n1, S2, 0, m2, 0, n1, A, 0, m1, 0, n1, E, ff);
+ if (gc_needed(av, 1))
+ gerepileall(av, 4, &M2, &T3, &M3, &S3); /* destroy S2 */
+ A11 = matslice(A, 1, m1, 1, n1);
+ B11 = matslice(B, 1, n1, 1, p1);
+ M1 = gen_matmul_i(A11, B11, m1 + 1, n1 + 1, p1 + 1, E, ff);
+ if (gc_needed(av, 1))
+ gerepileall(av, 5, &M2, &T3, &M3, &S3, &M1); /* destroy A11, B11 */
+ A12 = matslice(A, 1, m1, n1 + 1, n);
+ B21 = matslice(B, n1 + 1, n, 1, p1);
+ M4 = gen_matmul_i(A12, B21, m1 + 1, n2 + 1, p1 + 1, E, ff);
+ if (gc_needed(av, 1))
+ gerepileall(av, 6, &M2, &T3, &M3, &S3, &M1, &M4); /* destroy A12, B21 */
+ C11 = add_slices(m1, p1, M1, 0, m1, 0, p1, M4, 0, m1, 0, p1, E, ff);
+ if (gc_needed(av, 1))
+ gerepileall(av, 6, &M2, &T3, &M3, &S3, &M1, &C11); /* destroy M4 */
+ M5 = gen_matmul_i(S3, T3, m1 + 1, n1 + 1, p1 + 1, E, ff);
+ S4 = subtract_slices(m1, n2, A, 0, m1, n1, n2, S3, 0, m1, 0, n2, E, ff);
+ if (gc_needed(av, 1))
+ gerepileall(av, 7, &M2, &T3, &M3, &M1, &C11, &M5, &S4); /* destroy S3 */
+ T4 = add_slices(n2, p1, B, n1, n2, 0, p1, T3, 0, n2, 0, p1, E, ff);
+ if (gc_needed(av, 1))
+ gerepileall(av, 7, &M2, &M3, &M1, &C11, &M5, &S4, &T4); /* destroy T3 */
+ V1 = subtract_slices(m1, p1, M1, 0, m1, 0, p1, M5, 0, m1, 0, p1, E, ff);
+ if (gc_needed(av, 1))
+ gerepileall(av, 6, &M2, &M3, &S4, &T4, &C11, &V1); /* destroy M1, M5 */
+ B22 = matslice(B, n1 + 1, n, p1 + 1, p);
+ M6 = gen_matmul_i(S4, B22, m1 + 1, n2 + 1, p2 + 1, E, ff);
+ if (gc_needed(av, 1))
+ gerepileall(av, 6, &M2, &M3, &T4, &C11, &V1, &M6); /* destroy S4, B22 */
+ A22 = matslice(A, m1 + 1, m, n1 + 1, n);
+ M7 = gen_matmul_i(A22, T4, m2 + 1, n2 + 1, p1 + 1, E, ff);
+ if (gc_needed(av, 1))
+ gerepileall(av, 6, &M2, &M3, &C11, &V1, &M6, &M7); /* destroy A22, T4 */
+ V3 = add_slices(m1, p2, V1, 0, m1, 0, p2, M3, 0, m2, 0, p2, E, ff);
+ C12 = add_slices(m1, p2, V3, 0, m1, 0, p2, M6, 0, m1, 0, p2, E, ff);
+ if (gc_needed(av, 1))
+ gerepileall(av, 6, &M2, &M3, &C11, &V1, &M7, &C12); /* destroy V3, M6 */
+ V2 = add_slices(m2, p1, V1, 0, m2, 0, p1, M2, 0, m2, 0, p2, E, ff);
+ if (gc_needed(av, 1))
+ gerepileall(av, 5, &M3, &C11, &M7, &C12, &V2); /* destroy V1, M2 */
+ C21 = add_slices(m2, p1, V2, 0, m2, 0, p1, M7, 0, m2, 0, p1, E, ff);
+ if (gc_needed(av, 1))
+ gerepileall(av, 5, &M3, &C11, &C12, &V2, &C21); /* destroy M7 */
+ C22 = add_slices(m2, p2, V2, 0, m2, 0, p2, M3, 0, m2, 0, p2, E, ff);
+ if (gc_needed(av, 1))
+ gerepileall(av, 4, &C11, &C12, &C21, &C22); /* destroy V2, M3 */
+ C = mkmat2(mkcol2(C11, C21), mkcol2(C12, C22));
+ return gerepileupto(av, matconcat(C));
+}
+
+/* Strassen-Winograd used for dim >= gen_matmul_sw_bound */
+static const long gen_matmul_sw_bound = 24;
+
+static GEN
+gen_matmul_i(GEN A, GEN B, long l, long la, long lb,
+ void *E, const struct bb_field *ff)
+{
+ if (l <= gen_matmul_sw_bound
+ || la <= gen_matmul_sw_bound
+ || lb <= gen_matmul_sw_bound)
+ return gen_matmul_classical(A, B, l, la, lb, E, ff);
+ else
+ return gen_matmul_sw(A, B, l - 1, la - 1, lb - 1, E, ff);
+}
+
GEN
gen_matmul(GEN A, GEN B, void *E, const struct bb_field *ff)
{
- ulong j, l, lgA, lgB = lg(B);
- GEN C;
+ ulong lgA, lgB = lg(B);
if (lgB == 1)
return cgetg(1, t_MAT);
lgA = lg(A);
@@ -401,11 +602,7 @@ gen_matmul(GEN A, GEN B, void *E, const struct bb_field *ff)
pari_err_OP("operation 'gen_matmul'", A, B);
if (lgA == 1)
return zeromat(0, lgB - 1);
- l = lgcols(A);
- C = cgetg(lgB, t_MAT);
- for(j = 1; j < lgB; j++)
- gel(C, j) = gen_matcolmul_i(A, gel(B, j), lgA, l, E, ff);
- return C;
+ return gen_matmul_i(A, B, lgcols(A), lgA, lgB, E, ff);
}
static GEN
diff --git a/src/test/32/ff b/src/test/32/ff
index df1b630..2f03fc2 100644
--- a/src/test/32/ff
+++ b/src/test/32/ff
@@ -296,14 +296,26 @@ t^4 + t^2
[3*t^2 + 3*t, 18446744073709551628*t^4 + 5*t^3 + 4*t^2 + 1844674407370955162
7*t + 18446744073709551628, 18446744073709551628*t^4 + 5*t^3 + 4*t^2 + 18446
744073709551628*t]~
-? test(q)=my(t=ffgen(q,'t),M=matrix(10,10,i,j,random(t)));subst(charpoly(M),'x,M)==0;
-? test(nextprime(2^7)^5)
+? test(q,n)=my(t=ffgen(q,'t),M=matrix(n,n,i,j,random(t)));subst(charpoly(M),'x,M)==0;
+? test(nextprime(2^7)^5,10)
1
-? test(nextprime(2^15)^5)
+? test(nextprime(2^15)^5,10)
1
-? test(nextprime(2^31)^5)
+? test(nextprime(2^31)^5,10)
1
-? test(nextprime(2^63)^5)
+? test(nextprime(2^63)^5,10)
+1
+? test(nextprime(2^80)^2,10)
+1
+? test(nextprime(2^7)^5,27)
+1
+? test(nextprime(2^15)^5,27)
+1
+? test(nextprime(2^31)^5,27)
+1
+? test(nextprime(2^63)^5,27)
+1
+? test(nextprime(2^80)^2,27)
1
? print("Total time spent: ",gettime);
-Total time spent: 1440
+Total time spent: 1959
diff --git a/src/test/in/ff b/src/test/in/ff
index 8ed968c..4d07a4e 100644
--- a/src/test/in/ff
+++ b/src/test/in/ff
@@ -88,11 +88,17 @@ test(2^5)
test(7^5)
test((2^64+13)^5)
-test(q)={
- my(t = ffgen(q, 't), M = matrix(10, 10, i, j, random(t)));
+test(q, n)={
+ my(t = ffgen(q, 't), M = matrix(n, n, i, j, random(t)));
subst(charpoly(M), 'x, M) == 0;
}
-test(nextprime(2^7)^5)
-test(nextprime(2^15)^5)
-test(nextprime(2^31)^5)
-test(nextprime(2^63)^5)
+test(nextprime(2^7)^5, 10)
+test(nextprime(2^15)^5, 10)
+test(nextprime(2^31)^5, 10)
+test(nextprime(2^63)^5, 10)
+test(nextprime(2^80)^2, 10)
+test(nextprime(2^7)^5, 27)
+test(nextprime(2^15)^5, 27)
+test(nextprime(2^31)^5, 27)
+test(nextprime(2^63)^5, 27)
+test(nextprime(2^80)^2, 27)