Heat Equation¶
Python Numpy¶
from __future__ import print_function
import numpy as np
from benchpress.benchmarks import util
bench = util.Benchmark("Solving the heat equation using the jacobi method", "height*width*iterations")
def init_grid(height, width, dtype=np.float32):
grid = np.zeros((height + 2, width + 2), dtype=dtype)
grid[:, 0] = dtype(-273.15)
grid[:, -1] = dtype(-273.15)
grid[-1, :] = dtype(-273.15)
grid[0, :] = dtype(40.0)
return grid
def jacobi(grid, epsilon=0.005, max_iterations=None):
def loop_body(grid):
center = grid[1:-1, 1:-1]
north = grid[0:-2, 1:-1]
east = grid[1:-1, 2:]
west = grid[1:-1, 0:-2]
south = grid[2:, 1:-1]
work = 0.2 * (center + north + east + west + south)
delta = np.sum(np.absolute(work - center))
center[:] = work
bench.plot_surface(grid, "2d", 0, 200, -200)
return delta > epsilon
bench.do_while(loop_body, max_iterations, grid)
return grid
def main():
H = bench.args.size[0]
W = bench.args.size[1]
I = bench.args.size[2]
grid = bench.load_data()
if grid is not None:
grid = grid['grid']
else:
grid = init_grid(H, W, dtype=bench.dtype)
bench.start()
grid = jacobi(grid, max_iterations=I)
bench.stop()
bench.save_data({'grid': grid})
bench.pprint()
if __name__ == "__main__":
main()
C99 Omp¶
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <bp_util.h>
/*
Without collapsing
void solve(int height, int width, double *grid, double epsilon, int max_iterations)
{
double *T = (double*)malloc(height*width*sizeof(double));
double delta = epsilon+1.0;
int iterations = 0;
while(delta>epsilon) {
++iterations;
#pragma omp parallel for shared(grid, T) reduction(+:delta)
for(int i=0; i<height-2; ++i)
{
int a = i * width;
const double *up = &grid[a+1];
const double *left = &grid[a+width];
const double *right = &grid[a+width+2];
const double *down = &grid[a+1+width*2];
const double *center = &grid[a+width+1];
double *t_center = &T[a+width+1];
double delta_local = 0;
for(int j=0; j<width-2; ++j)
{
*t_center = (*center + *up + *left + *right + *down) * 0.2;
delta_local += fabs(*t_center - *center);
center++;up++;left++;right++;down++;t_center++;
}
delta += delta_local;
}
#pragma omp parallel for shared(grid, T)
for(int i=0; i<height-2; ++i)
{
int a = i * width;
const double *center = &grid[a+width+1];
double *t_center = &T[a+width+1];
for(int j=0; j<width-2; ++j)
{
*t_center = *center;
++t_center;
++center;
}
}
if (iterations>=max_iterations) {
break;
}
}
free(T);
}*/
void solve(int height, int width, double *grid, double epsilon, int max_iterations)
{
double *T = (double*)malloc(height*width*sizeof(double));
double delta = epsilon+1.0;
int iterations = 0;
while (delta>epsilon) {
++iterations;
#pragma omp parallel for reduction(+:delta) collapse(2)
for (int i=1; i<height-1; ++i) {
for (int j=1; j<width-1; ++j) {
const int a = i * width + j;
const double *up = &grid[a-width];
const double *left = &grid[a-1];
const double *right = &grid[a+1];
const double *down = &grid[a+width];
const double *center = &grid[a];
double *t_center = &T[a];
*t_center = (*up + *down + *center + *left + *right) * 0.2;
delta += fabs(*t_center - *center);
}
}
#pragma omp parallel for collapse(2)
for (int i=1; i<height-1; ++i) {
for (int j=1; j<width-1; ++j) {
const int a = i * width + j;
const double *center = &grid[a];
double *t_center = &T[a];
*t_center = *center;
}
}
if (iterations>=max_iterations) {
break;
}
}
free(T);
}
int main (int argc, char **argv)
{
bp_util_type bp = bp_util_create(argc, argv, 3);
if (bp.args.has_error) {
return 1;
}
const int height = bp.args.sizes[0];
const int width = bp.args.sizes[1];
const int iter = bp.args.sizes[2];
double epsilon = 0.005;
size_t grid_size = height*width*sizeof(double);
double *grid = (double*)malloc(grid_size);
//
// NumaEffects - begin
//
// memset(grid, 0, grid_size); // <--- bad idea.
// memset is sequentiel and will touch the entire
// grid on one numa-node.
//
// Instead of memset, parallel initialization
// is performed with the following loop construct:
//
double* grid_i = grid;
#pragma omp parallel for collapse(2)
for (int i=0; i<height; ++i) {
for (int j=0; j<width; ++j) {
*grid_i = 0;
++grid_i;
}
}
//
// NumaEffects - end
for (int j=0; j<height; j++) {
grid[j*width] = -273.15;
grid[j*width+width-1] = -273.15;
}
for (int j=0; j<width; j++) {
grid[j+(height-1)*width] = -273.15;
grid[j] = 40.0;
}
bp.timer_start();
solve(height, width, grid, epsilon, iter);
bp.timer_stop();
bp.print("heat_equation(c99_omp)");
free(grid);
return 0;
}
C99 Omp Mpi¶
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <mpi.h>
#include <bp_util.h>
inline double innerloop(const double* grid, double* T, int width, int i)
{
int a = i * width;
const double *up = &grid[a+1];
const double *left = &grid[a+width];
const double *right = &grid[a+width+2];
const double *down = &grid[a+1+width*2];
const double *center = &grid[a+width+1];
double *t_center = &T[a+width+1];
double delta = 0;
for(int j=0; j<width-2; ++j)
{
*t_center = (*center + *up + *left + *right + *down) * 0.2;
delta += fabs(*t_center - *center);
center++;up++;left++;right++;down++;t_center++;
}
return delta;
}
void openmp(int height, int width, double *grid, int iter)
{
double *T = (double*)malloc(height*width*sizeof(double));
memcpy(T, grid, height*width*sizeof(double));
for(int n=0; n<iter; ++n)
{
double delta=0;
#pragma omp parallel for shared(grid,T) reduction(+:delta)
for(int i=0; i<height-2; ++i)
{
delta += innerloop(grid, T, width, i);
}
memcpy(grid, T, height*width*sizeof(double));
}
free(T);
}
void openmp_mpi(int height, int width, double *grid, int iter)
{
int myrank, worldsize;
MPI_Comm comm;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
int periods[] = {0};
MPI_Cart_create(MPI_COMM_WORLD, 1, &worldsize,
periods, 1, &comm);
double *T = (double*)malloc(height*width*sizeof(double));
memcpy(T, grid, height*width*sizeof(double));
for(int n=0; n<iter; ++n)
{
int p_src, p_dest;
//Send/receive - neighbor above
MPI_Cart_shift(comm,0,1,&p_src,&p_dest);
MPI_Sendrecv(grid+width,width,MPI_DOUBLE,
p_dest,1,
grid,width, MPI_DOUBLE,
p_src,1,comm,MPI_STATUS_IGNORE);
//Send/receive - neighbor below
MPI_Cart_shift(comm,0,-1,&p_src,&p_dest);
MPI_Sendrecv(grid+(height-2)*width, width,MPI_DOUBLE,
p_dest,1,
grid+(height-1)*width,
width,MPI_DOUBLE,
p_src,1,comm,MPI_STATUS_IGNORE);
double delta=0, global_delta;
#pragma omp parallel for shared(grid,T) reduction(+:delta)
for(int i=0; i<height-2; ++i)
{
delta += innerloop(grid, T, width, i);
}
memcpy(grid, T, height*width*sizeof(double));
MPI_Allreduce(&global_delta, &delta, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
}
free(T);
}
int main (int argc, char **argv)
{
MPI_Init(&argc,&argv);
int myrank, worldsize;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_size(MPI_COMM_WORLD, &worldsize);
bp_util_type bp = bp_util_create(argc, argv, 3);
if (bp.args.has_error) {
return 1;
}
int height = bp.args.sizes[0];
const int width = bp.args.sizes[1];
const int iter = bp.args.sizes[2];
//Local vertical size. NB: the horizontal size is always the full grid including borders
height = (width-2) / worldsize;
if(myrank == worldsize-1)
height += (width-2) % worldsize;
height += 2;//Add a ghost line above and below
double *grid = malloc(height*width*sizeof(double));
for(int j=0; j<width;j++)
{
if(myrank == 0)
grid[j] = 40.0;
if(myrank == worldsize-1)
grid[j+(height-1)*width] = -273.15;
}
for(int j=1; j<height-1;j++)
{
grid[j*width] = -273.15;
grid[j*width+width-1]= -273.15;
}
MPI_Barrier(MPI_COMM_WORLD);
bp.timer_start();
openmp(height,width,grid,iter);
MPI_Barrier(MPI_COMM_WORLD);
bp.timer_stop();
if (myrank == 0) {
bp.print("heat_equation(c99_omp_mpi)");
}
free(grid);
MPI_Finalize();
return 0;
}
C99 Seq¶
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <bp_util.h>
void sequential(int height, int width, double *grid, int iter)
{
double *T = (double*)malloc(height*width*sizeof(double));
memcpy(T, grid, height*width*sizeof(double));
for(int n=0; n<iter; ++n)
{
double *a = grid;
double *t = T;
double delta=0;
for(int i=1; i<height-1; ++i)
{
double *up = a+1;
double *left = a+width;
double *right = a+width+2;
double *down = a+1+width*2;
double *center = a+width+1;
double *t_center = t+width+1;
for(int j=0; j<width-2; ++j)
{
*t_center = (*center + *up + *left + *right + *down) * 0.2;
delta += fabs(*t_center - *center);
center++;up++;left++;right++;down++;t_center++;
}
a += width;
t += width;
}
#ifdef DEBUG
printf("Delta: %lf\n",delta);
#endif
memcpy(grid, T, height*width*sizeof(double));
}
free(T);
}
int main (int argc, char **argv)
{
bp_util_type bp = bp_util_create(argc, argv, 3);// Grab arguments
if (bp.args.has_error) {
return 1;
}
const int height = bp.args.sizes[0];
const int width = bp.args.sizes[1];
const int iter = bp.args.sizes[2];
size_t grid_size = height*width*sizeof(double);
double *grid = (double*)malloc(grid_size);
memset(grid, 0, grid_size);
for(int j=0; j<width;j++)
{
grid[j] = 40.0;
grid[j+(height-1)*width] = -273.15;
}
for(int j=1; j<height-1;j++)
{
grid[j*width] = -273.15;
grid[j*width+width-1]= -273.15;
}
#ifdef DEBUG
for (int i = 0; i<height; i++)
{
for(int j=0; j<width;j++)
{
printf ("%lf ", grid[j+i*width]);
}
printf ("\n");
}
#endif
bp.timer_start();
sequential(height,width,grid,iter);
bp.timer_stop();
#ifdef DEBUG
for (int i = 0; i<height; i++)
{
for(int j=0; j<width;j++)
{
printf ("%lf ", grid[j+i*width]);
}
printf ("\n");
}
#endif
bp.print("heat_equation(c99_seq)");
free(grid);
return 0;
}
Cpp11 Omp¶
#include <iostream>
#include <sstream>
#include <string>
#include <cmath>
#include <bp_util.h>
#define xdim 14000
#define ydim 14000
using namespace std;
template <typename T>
void solve(T grid, double epsilon, int max_iterations)
{
T temp = new double[ydim][xdim];
double delta = epsilon+1.0;
int iterations = 0; // Compute the heat equation
while(delta>epsilon) {
++iterations;
#pragma omp parallel for reduction(+:delta) collapse(2)
for (int i=1; i<ydim-1; i++) {
for(int j=1;j<xdim-1;j++) {
temp[i][j] = (grid[i-1][j] + grid[i+1][j] + grid[i][j] + grid[i][j-1] + grid[i][j+1])*0.2;
delta += abs(temp[i][j] - grid[i][j]);
}
}
#pragma omp parallel for collapse(2)
for (int i=1;i<ydim-1; i++) {
for(int j=1;j<xdim-1;j++) {
grid[i][j] = temp[i][j];
}
}
if (iterations>=max_iterations) {
break;
}
}
}
int main(int argc, char* argv[])
{
bp_util_type bp = bp_util_create(argc, argv, 3);// Grab arguments
if (bp.args.has_error) {
return 1;
}
bp.timer_start();
const int width = bp.args.sizes[0];
const int height = bp.args.sizes[1];
if (ydim != height) {
cout << "Multidimensional arrays in C11 does not support dynamic size, so it has to be: " << ydim << "." << endl;
return 0;
}
if (xdim != width) {
cout << "Multidimensional arrays in C11 does not support dynamic size, so it has to be: " << xdim << "." << endl;
return 0;
}
const int max_iterations = bp.args.sizes[2];
double epsilon = 0.005;
auto grid = new double[ydim][xdim];
#pragma omp parallel for collapse(2)
for (int i=0; i<ydim; i++) { // Initialize the grid
for (int j=0;j<xdim;j++) {
grid[i][j] = 0;
}
}
for (int i=0; i<ydim; i++) { // And borders
grid[i][0] = -273.15;
grid[i][xdim-1] = -273.15;
}
for (int i=0; i<xdim; i++) {
grid[0][i] = -273.15;
grid[ydim-1][i] = 40.0;
}
bp.timer_start(); // Start timer
solve(grid, epsilon, max_iterations);
bp.timer_stop();
bp.print("heat_equation(cpp11_omp)");
if (bp.args.verbose) { // and values.
cout << ", \"output\": [";
for (int i=0; i<10; ++i) {
for (int j=0; j<10; ++j) {
cout << grid[i][j] << ", ";
}
cout << endl;
}
cout << "]";
}
return 0;
}
Cpp11 Opencl¶
Error
There are issues with the implementation.
Two known issues:
* Implementation compiles (with warning) but execution is untested.
* Implementation does not use ``bp-util`` for argparsing and timing, getting it to run in a suite might be cumbersome...
#include <iostream>
#include <sys/time.h>
#include <CL/cl.hpp>
#include <stdexcept>
#include <sys/time.h>
#include <cstdio>
#include <fstream>
#include <sstream>
#include <utility>
cl::Context context;
std::vector<cl::Device> devices;
cl::CommandQueue commandQueue;
cl::Kernel kernel;
#define TPB 32
#ifndef DTYPE
#define DTYPE double
#endif
#define STR(s) #s
#define xSTR(s) STR(s)
int main (int argc, char** argv)
{
if (argc != 5)
{
std::cout << "Usage: " << argv[0] << " HEAT-EQ-KERNEL-FILE WIDTH HEIGHT ITERATIONS" << std::endl;
return 0;
}
// Commandline arguments
char *opencl_file = argv[1];
unsigned int width = atoi(argv[2]);
unsigned int height = atoi(argv[3]);
unsigned int iter = atoi(argv[4]);
// Setup OpenCL execution system
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
bool foundPlatform = false;
std::vector<cl::Platform>::iterator it;
for (it = platforms.begin(); it != platforms.end(); ++it)
{
try {
cl_context_properties props[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(*it)(),0};
context = cl::Context(CL_DEVICE_TYPE_GPU, props);
foundPlatform = true;
break;
}
catch (cl::Error e)
{
foundPlatform = false;
}
}
if (foundPlatform)
{
devices = context.getInfo<CL_CONTEXT_DEVICES>();
commandQueue = cl::CommandQueue(context,devices[0],0);
} else {
throw std::runtime_error("Could not find valid OpenCL platform.");
}
// Read source file and compile code
std::ifstream file(opencl_file, std::ios::in);
if (!file.is_open())
{
throw std::runtime_error("Could not open source file.");
}
std::ostringstream srcstrs;
srcstrs << "#define DTYPE " << xSTR(DTYPE) << "\n";
srcstrs << file.rdbuf();
std::string srcstr = srcstrs.str();
#ifdef DEBUG
std::cout << srcstr << std::endl;
#endif
cl::Program::Sources source(1,std::make_pair(srcstr.c_str(),0));
cl::Program program(context, source);
try {program.build(devices);}
catch (cl::Error e)
{
std::cout << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]) << std::endl;
}
kernel = cl::Kernel(program, "heat_eq_jacobi");
// Set up model
unsigned int w = width+2;
unsigned int h = height+2;
size_t grid_size = w*h*sizeof(DTYPE);
size_t delta_size = w*sizeof(DTYPE);
DTYPE* grid = (DTYPE*)malloc(grid_size);
memset(grid, 0, grid_size);
for(int j=0; j<w;j++)
{
grid[j] = 40.0;
grid[j+(h-1)*w] = -273.15;
}
for(int j=1; j<h-1;j++)
{
grid[j*w] = -273.15;
grid[j*w+w-1]= -273.15;
}
#ifdef DEBUG
for (int i = 0; i<h; i++)
{
for(int j=0; j<w;j++)
{
printf ("%lf ", grid[j+i*w]);
}
printf ("\n");
}
#endif
// Start timing
timeval t_start,t_end;
gettimeofday(&t_start,NULL);
// Set up buffers and copy data
cl::Buffer inDev = cl::Buffer(context, CL_MEM_READ_WRITE, grid_size, NULL);
cl::Buffer outDev = cl::Buffer(context, CL_MEM_READ_WRITE, grid_size, NULL);
cl::Buffer deltaDev = cl::Buffer(context, CL_MEM_READ_WRITE, delta_size, NULL);
cl::Buffer deltaHost = cl::Buffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, delta_size, NULL);
DTYPE* deltaHostPtr = (DTYPE*)commandQueue.enqueueMapBuffer(deltaHost, CL_TRUE, CL_MAP_WRITE, 0, delta_size);
commandQueue.enqueueWriteBuffer(inDev, CL_FALSE, 0, grid_size, grid);
commandQueue.enqueueCopyBuffer(inDev, outDev, 0, 0, grid_size);
// Iterating
for(int n=0; n<iter; ++n)
{
kernel.setArg(0,width);
kernel.setArg(1,height);
kernel.setArg(2,inDev);
kernel.setArg(3,outDev);
kernel.setArg(4,deltaDev);
commandQueue.enqueueNDRangeKernel(kernel, cl::NullRange,
cl::NDRange((width/TPB+1)*TPB), cl::NDRange(TPB));
commandQueue.enqueueReadBuffer(deltaDev, CL_FALSE, 0, delta_size, deltaHostPtr);
commandQueue.finish();
DTYPE delta = 0.0;
for (unsigned int i = 0; i < width; ++i)
delta += deltaHostPtr[i];
#ifdef DEBUG
std::cout << "Delta: " << delta << std::endl;
#endif
cl::Buffer tmp = inDev;
inDev = outDev;
outDev = tmp;
}
commandQueue.enqueueReadBuffer(inDev, CL_TRUE, 0, grid_size, grid);
gettimeofday(&t_end,NULL);
#ifdef DEBUG
for (int i = 0; i<h; i++)
{
for(int j=0; j<w;j++)
{
printf ("%lf ", grid[j+i*w]);
}
printf ("\n");
}
#endif
double d_time = (t_end.tv_sec - t_start.tv_sec) + (t_end.tv_usec - t_start.tv_usec)/1000000.0;
std::cout << argv[0] << " - iter: " << iter << " size: " << width << "*" << height << " elapsed-time: " <<
d_time << std::endl;
return 0;
}
Csharp Numcil¶
#region Copyright
/*
This file is part of Bohrium and copyright (c) 2012 the Bohrium:
team <http://www.bh107.org>.
Bohrium is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation, either version 3
of the License, or (at your option) any later version.
Bohrium is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the
GNU Lesser General Public License along with Bohrium.
If not, see <http://www.gnu.org/licenses/>.
*/
#endregion
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using R = NumCIL.Range;
namespace HeatEquation
{
using NumCIL.Double;
using T = System.Double;
public static class HeatEquationSolverDouble
{
public static NdArray Create(long width, long height)
{
var res = Generate.Zeroes(height + 2, width + 2);
res[R.All, R.El(0)] = -273.5f;
res[R.All, R.El(-1)] = -273.5f;
res[R.El(0), R.All] = 40f;
res[R.El(-1), R.All] = -273.5f;
return res;
}
public static Tuple<int, NdArray> Solve(NdArray full, long? fixedIterations = null)
{
var center = full[R.Slice(1, -1), R.Slice(1, -1) ];
var north = full[R.Slice(1, -1), R.Slice(0, -2) ];
var east = full[R.Slice(0, -2), R.Slice(1, -1) ];
var west = full[R.Slice(2, 0), R.Slice(1, -1) ];
var south = full[R.Slice(1, -1), R.Slice(2, 0) ];
if (fixedIterations != null)
{
for(var i =0; i < fixedIterations.Value; i++)
center[R.All] = 0.2f * (center + north + east + west + south);
return new Tuple<int, NdArray>((int)fixedIterations.Value, full);
}
else
{
T epsilon = 0.005f;
T delta = epsilon + 1;
int iteration = 0;
while (epsilon < delta)
{
iteration++;
var work = 0.2f * (center + north + east + west + south);
delta = (work - center).Abs().Sum();
center[R.All] = work;
}
return new Tuple<int, NdArray>(iteration, full);
}
}
}
}
namespace HeatEquation
{
using NumCIL.Float;
using T = System.Single;
public static class HeatEquationSolverSingle
{
public static NdArray Create(long width, long height)
{
var res = Generate.Zeroes(height + 2, width + 2);
res[R.All, R.El(0)] = -273.5f;
res[R.All, R.El(-1)] = -273.5f;
res[R.El(0), R.All] = 40f;
res[R.El(-1), R.All] = -273.5f;
return res;
}
public static Tuple<int, NdArray> Solve(NdArray full, long? fixedIterations = null)
{
var center = full[R.Slice(1, -1), R.Slice(1, -1) ];
var north = full[R.Slice(1, -1), R.Slice(0, -2) ];
var east = full[R.Slice(0, -2), R.Slice(1, -1) ];
var west = full[R.Slice(2, 0), R.Slice(1, -1) ];
var south = full[R.Slice(1, -1), R.Slice(2, 0) ];
if (fixedIterations != null)
{
for(var i =0; i < fixedIterations.Value; i++)
center[R.All] = 0.2f * (center + north + east + west + south);
return new Tuple<int, NdArray>((int)fixedIterations.Value, full);
}
else
{
T epsilon = 0.005f;
T delta = epsilon + 1;
int iteration = 0;
while (epsilon < delta)
{
iteration++;
var work = 0.2f * (center + north + east + west + south);
delta = (work - center).Abs().Sum();
center[R.All] = work;
}
return new Tuple<int, NdArray>(iteration, full);
}
}
}
}