Performance bug in standard formulation of stencil
The following program contains a nice stencil function, and an explicit one. Defining SLOW
gives the former, not defining anything the latter. On my machine (compiled with sac2c_p -maxwlur 9
) there is a difference between 27 GB/s and 3.8 GB/s. Profiling shows that the slow version does not vectorize, and that there is some memory management in the tight loop.
use StdIO: all;
use Math: all;
use Structures: all;
use Benchmarking: all;
double f(double x, double y)
{
return -2d * sin(x + y);
}
inline
double[d:shp] stencil(double[d:shp] x, double[d:wshp] w)
{
return {iv -> sum({jv -> w[jv] * x[mod(iv + jv - wshp / 2, shp)]})
| iv < shp};
}
inline
double[n, n] stencil_explicit(double[n, n] x, double[3] w)
{
return with {
([0, 0] <= iv < [n, n]): w[0] * x[iv] +
w[1] * (x[mod(iv - [1, 0], [n, n])] +
x[mod(iv + [1, 0], [n, n])] +
x[mod(iv + [0, 1], [n, n])] +
x[mod(iv - [0, 1], [n, n])]) +
w[2] * (x[mod(iv + [1, 1], [n, n])] +
x[mod(iv + [ 1, -1], [n, n])] +
x[mod(iv + [-1, 1], [n, n])] +
x[mod(iv + [-1, -1], [n, n])]);
}: genarray([n, n], 0d);
}
inline
double L2(double[*] x) { return sqrt(sum(x * x)); }
int main()
{
N = 16384;
ITER = 5;
i_init = getInterval("init", 2);
start(i_init);
pi = 4d * atan(1d);
a = 0d;
b = 2d * pi;
n = N;
h = (b - a) / tod(n);
F = {[i, j] -> f(a + tod(i) * h, a + tod(j) * h)
| [i, j] < [n, n]};
end(i_init);
time, unit = returnResultUnit(i_init);
printf("Initialisation took %lf %s\n", time, unit);
i_v = getInterval("v", 2);
start(i_v);
for (t = 0; t < ITER; t++) {
#ifdef SLOW
F = stencil(F, [[0d, 1d , 0d],
[1d, -4d, 1d],
[0d, 1d , 0d]]);
#else
F = stencil_explicit(F, [-4d, 1d, 0d]);
#endif
}
end(i_v);
time, unit = returnResultUnit(i_v);
printf("Bandwidth: %lf GB/s\n",
tod(N) * tod(N) * tod(ITER) * 3d * 8d / time * 1e-9);
printf("L2 norm: %e\n", L2(F));
return 0;
}