Heat Equation¶

Python Numpy¶

from __future__ import print_function
import numpy as np
from benchpress.benchmarks import util

bench = util.Benchmark("Solving the heat equation using the jacobi method", "height*width*iterations")


def init_grid(height, width, dtype=np.float32):
    grid = np.zeros((height + 2, width + 2), dtype=dtype)
    grid[:, 0] = dtype(-273.15)
    grid[:, -1] = dtype(-273.15)
    grid[-1, :] = dtype(-273.15)
    grid[0, :] = dtype(40.0)
    return grid


def jacobi(grid, epsilon=0.005, max_iterations=None):
    def loop_body(grid):
        center = grid[1:-1, 1:-1]
        north = grid[0:-2, 1:-1]
        east = grid[1:-1, 2:]
        west = grid[1:-1, 0:-2]
        south = grid[2:, 1:-1]
        work = 0.2 * (center + north + east + west + south)
        delta = np.sum(np.absolute(work - center))
        center[:] = work
        bench.plot_surface(grid, "2d", 0, 200, -200)
        return delta > epsilon

    bench.do_while(loop_body, max_iterations, grid)
    return grid


def main():
    H = bench.args.size[0]
    W = bench.args.size[1]
    I = bench.args.size[2]

    grid = bench.load_data()
    if grid is not None:
        grid = grid['grid']
    else:
        grid = init_grid(H, W, dtype=bench.dtype)

    bench.start()
    grid = jacobi(grid, max_iterations=I)
    bench.stop()
    bench.save_data({'grid': grid})
    bench.pprint()


if __name__ == "__main__":
    main()

C99 Omp¶

#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <bp_util.h>

/*
Without collapsing
void solve(int height, int width, double *grid, double epsilon, int max_iterations)
{
    double *T = (double*)malloc(height*width*sizeof(double));

    double delta = epsilon+1.0;
    int iterations = 0;

    while(delta>epsilon) {
        ++iterations;

        #pragma omp parallel for shared(grid, T) reduction(+:delta)
        for(int i=0; i<height-2; ++i)
        {
            int a = i * width;
            const double *up     = &grid[a+1];
            const double *left   = &grid[a+width];
            const double *right  = &grid[a+width+2];
            const double *down   = &grid[a+1+width*2];
            const double *center = &grid[a+width+1];
            double *t_center = &T[a+width+1];

            double delta_local = 0;
            for(int j=0; j<width-2; ++j)
            {
                *t_center = (*center + *up + *left + *right + *down) * 0.2;
                delta_local += fabs(*t_center - *center);
                center++;up++;left++;right++;down++;t_center++;
            }
            delta += delta_local;
        }

        #pragma omp parallel for shared(grid, T)
        for(int i=0; i<height-2; ++i)
        {
            int a = i * width;
            const double *center = &grid[a+width+1];
            double *t_center = &T[a+width+1];

            for(int j=0; j<width-2; ++j)
            {
                *t_center = *center;
                ++t_center;
                ++center;
            }
        }

        if (iterations>=max_iterations) {
            break;
        }
    }
    free(T);
}*/

void solve(int height, int width, double *grid, double epsilon, int max_iterations)
{
    double *T = (double*)malloc(height*width*sizeof(double));

    double delta = epsilon+1.0;
    int iterations = 0;

    while (delta>epsilon) {
        ++iterations;

        #pragma omp parallel for reduction(+:delta) collapse(2)
        for (int i=1; i<height-1; ++i) {
            for (int j=1; j<width-1; ++j) {
                const int a = i * width + j;
                const double *up     = &grid[a-width];
                const double *left   = &grid[a-1];
                const double *right  = &grid[a+1];
                const double *down   = &grid[a+width];
                const double *center = &grid[a];
                double *t_center = &T[a];

                *t_center = (*up + *down + *center + *left + *right) * 0.2;
                delta += fabs(*t_center - *center);
            }
        }

        #pragma omp parallel for collapse(2)
        for (int i=1; i<height-1; ++i) {
            for (int j=1; j<width-1; ++j) {
                const int a = i * width + j;
                const double *center = &grid[a];
                double *t_center = &T[a];

                *t_center = *center;
            }
        }

        if (iterations>=max_iterations) {
            break;
        }
    }
    free(T);
}

int main (int argc, char **argv)
{
    bp_util_type bp = bp_util_create(argc, argv, 3);
    if (bp.args.has_error) {
        return 1;
    }
    const int height    = bp.args.sizes[0];
    const int width     = bp.args.sizes[1];
    const int iter      = bp.args.sizes[2];
    double epsilon = 0.005;

    size_t grid_size = height*width*sizeof(double);
    double *grid = (double*)malloc(grid_size);
    //
    // NumaEffects - begin
    //
    // memset(grid, 0, grid_size);  // <--- bad idea.
    // memset is sequentiel and will touch the entire
    // grid on one numa-node.
    //
    // Instead of memset, parallel initialization
    // is performed with the following loop construct:
    //
    double* grid_i = grid;
    #pragma omp parallel for collapse(2)
    for (int i=0; i<height; ++i) {
        for (int j=0; j<width; ++j) {
            *grid_i = 0;
            ++grid_i;
        }
    }
    //
    // NumaEffects - end
    for (int j=0; j<height; j++) {
        grid[j*width]           = -273.15;
        grid[j*width+width-1]   = -273.15;
    }
    for (int j=0; j<width; j++) {
        grid[j+(height-1)*width] = -273.15;
        grid[j]                  = 40.0;
    }

    bp.timer_start();
    solve(height, width, grid, epsilon, iter);
    bp.timer_stop();

    bp.print("heat_equation(c99_omp)");
    free(grid);
    return 0;
}

C99 Omp Mpi¶

#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <mpi.h>
#include <bp_util.h>

inline double innerloop(const double* grid, double* T, int width, int i)
{
    int a = i * width;
    const double *up     = &grid[a+1];
    const double *left   = &grid[a+width];
    const double *right  = &grid[a+width+2];
    const double *down   = &grid[a+1+width*2];
    const double *center = &grid[a+width+1];
    double *t_center = &T[a+width+1];

    double delta = 0;
    for(int j=0; j<width-2; ++j)
    {
        *t_center = (*center + *up + *left + *right + *down) * 0.2;
        delta += fabs(*t_center - *center);
        center++;up++;left++;right++;down++;t_center++;
    }
    return delta;
}

void openmp(int height, int width, double *grid, int iter)
{
    double *T = (double*)malloc(height*width*sizeof(double));
    memcpy(T, grid, height*width*sizeof(double));
    for(int n=0; n<iter; ++n)
    {
        double delta=0;
        #pragma omp parallel for shared(grid,T) reduction(+:delta)
        for(int i=0; i<height-2; ++i)
        {
            delta += innerloop(grid, T, width, i);
        }
        memcpy(grid, T, height*width*sizeof(double));
    }
    free(T);
}

void openmp_mpi(int height, int width, double *grid, int iter)
{
    int myrank, worldsize;
    MPI_Comm comm;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &worldsize);

    int periods[] = {0};
    MPI_Cart_create(MPI_COMM_WORLD, 1, &worldsize,
                    periods, 1, &comm);

    double *T = (double*)malloc(height*width*sizeof(double));
    memcpy(T, grid, height*width*sizeof(double));
    for(int n=0; n<iter; ++n)
    {
        int p_src, p_dest;
        //Send/receive - neighbor above
        MPI_Cart_shift(comm,0,1,&p_src,&p_dest);
        MPI_Sendrecv(grid+width,width,MPI_DOUBLE,
                     p_dest,1,
                     grid,width, MPI_DOUBLE,
                     p_src,1,comm,MPI_STATUS_IGNORE);
        //Send/receive - neighbor below
        MPI_Cart_shift(comm,0,-1,&p_src,&p_dest);
        MPI_Sendrecv(grid+(height-2)*width, width,MPI_DOUBLE,
                     p_dest,1,
                     grid+(height-1)*width,
                     width,MPI_DOUBLE,
                     p_src,1,comm,MPI_STATUS_IGNORE);

        double delta=0, global_delta;
        #pragma omp parallel for shared(grid,T) reduction(+:delta)
        for(int i=0; i<height-2; ++i)
        {
            delta += innerloop(grid, T, width, i);
        }
        memcpy(grid, T, height*width*sizeof(double));
        MPI_Allreduce(&global_delta, &delta, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    }
    free(T);
}

int main (int argc, char **argv)
{
    MPI_Init(&argc,&argv);
    int myrank, worldsize;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &worldsize);

    bp_util_type bp = bp_util_create(argc, argv, 3);
    if (bp.args.has_error) {
        return 1;
    }
    int height      = bp.args.sizes[0];
    const int width = bp.args.sizes[1];
    const int iter  = bp.args.sizes[2];

    //Local vertical size. NB: the horizontal size is always the full grid including borders
    height = (width-2) / worldsize;
    if(myrank == worldsize-1)
        height += (width-2) % worldsize;
    height += 2;//Add a ghost line above and below

    double *grid = malloc(height*width*sizeof(double));
    for(int j=0; j<width;j++)
    {
        if(myrank == 0)
           grid[j] = 40.0;
        if(myrank == worldsize-1)
            grid[j+(height-1)*width] = -273.15;
    }
    for(int j=1; j<height-1;j++)
    {
        grid[j*width] = -273.15;
        grid[j*width+width-1]= -273.15;
    }

    MPI_Barrier(MPI_COMM_WORLD);
    bp.timer_start();
    openmp(height,width,grid,iter);

    MPI_Barrier(MPI_COMM_WORLD);
    bp.timer_stop();

    if (myrank == 0) {
        bp.print("heat_equation(c99_omp_mpi)");
    }
    free(grid);
    MPI_Finalize();
    return 0;
}

C99 Seq¶

#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <bp_util.h>

void sequential(int height, int width, double *grid, int iter)
{
  double *T = (double*)malloc(height*width*sizeof(double));
  memcpy(T, grid, height*width*sizeof(double));
  for(int n=0; n<iter; ++n)
  {
    double *a = grid;
    double *t = T;
    double delta=0;
    for(int i=1; i<height-1; ++i)
    {
      double *up     = a+1;
      double *left   = a+width;
      double *right  = a+width+2;
      double *down   = a+1+width*2;
      double *center = a+width+1;
      double *t_center = t+width+1;
      for(int j=0; j<width-2; ++j)
      {
        *t_center = (*center + *up + *left + *right + *down) * 0.2;
        delta += fabs(*t_center - *center);
        center++;up++;left++;right++;down++;t_center++;
      }
      a += width;
      t += width;
    }
#ifdef DEBUG
    printf("Delta: %lf\n",delta);
#endif
    memcpy(grid, T, height*width*sizeof(double));
  }
  free(T);
}

int main (int argc, char **argv)
{
    bp_util_type bp = bp_util_create(argc, argv, 3);// Grab arguments
    if (bp.args.has_error) {
        return 1;
    }

    const int height    = bp.args.sizes[0];
    const int width     = bp.args.sizes[1];
    const int iter      = bp.args.sizes[2];

    size_t grid_size = height*width*sizeof(double);
    double *grid = (double*)malloc(grid_size);
    memset(grid, 0, grid_size);
    for(int j=0; j<width;j++)
    {
        grid[j] = 40.0;
        grid[j+(height-1)*width] = -273.15;
    }
    for(int j=1; j<height-1;j++)
    {
        grid[j*width] = -273.15;
        grid[j*width+width-1]= -273.15;
    }
#ifdef DEBUG
    for (int i = 0; i<height; i++)
    {
        for(int j=0; j<width;j++)
        {
            printf ("%lf ", grid[j+i*width]);
        }
        printf ("\n");
    }       
#endif
    bp.timer_start();
    sequential(height,width,grid,iter);
    bp.timer_stop();
#ifdef DEBUG
    for (int i = 0; i<height; i++)
    {
        for(int j=0; j<width;j++)
        {
            printf ("%lf ", grid[j+i*width]);
        }
        printf ("\n");
    }       
#endif
    bp.print("heat_equation(c99_seq)");

    free(grid);
    return 0;
}

Cpp11 Omp¶

#include <iostream>
#include <sstream>
#include <string>
#include <cmath>
#include <bp_util.h>

#define xdim 14000
#define ydim 14000

using namespace std;

template <typename T>
void solve(T grid, double epsilon, int max_iterations)
{
    T temp = new double[ydim][xdim];

    double delta = epsilon+1.0;
    int iterations = 0;            // Compute the heat equation

    while(delta>epsilon) {
        ++iterations;

        #pragma omp parallel for reduction(+:delta) collapse(2)
        for (int i=1; i<ydim-1; i++) {
            for(int j=1;j<xdim-1;j++) {
                temp[i][j] = (grid[i-1][j] + grid[i+1][j] + grid[i][j] + grid[i][j-1] + grid[i][j+1])*0.2;
                delta += abs(temp[i][j] - grid[i][j]);
            }
        }

        #pragma omp parallel for collapse(2)
        for (int i=1;i<ydim-1; i++) {
            for(int j=1;j<xdim-1;j++) {
                grid[i][j] = temp[i][j];
            }
        }

        if (iterations>=max_iterations) {
            break;
        }
    }
}

int main(int argc, char* argv[])
{
    bp_util_type bp = bp_util_create(argc, argv, 3);// Grab arguments
    if (bp.args.has_error) {
        return 1;
    }
    bp.timer_start();

    const int width     = bp.args.sizes[0];
    const int height    = bp.args.sizes[1];

    if (ydim != height) {
        cout << "Multidimensional arrays in C11 does not support dynamic size, so it has to be: " << ydim << "." << endl;
        return 0;
    }

    if (xdim != width) {
        cout << "Multidimensional arrays in C11 does not support dynamic size, so it has to be: " << xdim << "." << endl;
        return 0;
    }

    const int max_iterations = bp.args.sizes[2];

    double epsilon  = 0.005;

    auto grid = new double[ydim][xdim];
    #pragma omp parallel for collapse(2)
    for (int i=0; i<ydim; i++) {      // Initialize the grid
        for (int j=0;j<xdim;j++) {
            grid[i][j] = 0;
        }
    }
    for (int i=0; i<ydim; i++) {      // And borders
        grid[i][0]      = -273.15;
        grid[i][xdim-1] = -273.15;
    }
    for (int i=0; i<xdim; i++) {
        grid[0][i]      = -273.15;
        grid[ydim-1][i] = 40.0;
    }

    bp.timer_start();                                   // Start timer
    solve(grid, epsilon, max_iterations);
    bp.timer_stop();

    bp.print("heat_equation(cpp11_omp)");
    if (bp.args.verbose) {                             // and values.
        cout << ", \"output\": [";
        for (int i=0; i<10; ++i) {
            for (int j=0; j<10; ++j) {
                cout << grid[i][j] << ", ";
            }
            cout << endl;
        }
        cout << "]";
    }

    return 0;
}

Cpp11 Opencl¶

Error

There are issues with the implementation.

Two known issues:

* Implementation compiles (with warning) but execution is untested.

* Implementation does not use ``bp-util`` for argparsing and timing, getting it to run in a suite might be cumbersome...

#include <iostream>
#include <sys/time.h>
#include <CL/cl.hpp>
#include <stdexcept>
#include <sys/time.h>
#include <cstdio>
#include <fstream>
#include <sstream>
#include <utility>

cl::Context context;
std::vector<cl::Device> devices;
cl::CommandQueue commandQueue;
cl::Kernel kernel;

#define TPB 32
#ifndef DTYPE
#define DTYPE double
#endif
#define STR(s) #s
#define xSTR(s) STR(s)

int main (int argc, char** argv)
{
    if (argc != 5)
    {
        std::cout << "Usage: " << argv[0] << " HEAT-EQ-KERNEL-FILE WIDTH HEIGHT ITERATIONS" << std::endl;
        return 0;
    }
    // Commandline arguments
    char *opencl_file = argv[1];
    unsigned int width  = atoi(argv[2]);
    unsigned int height = atoi(argv[3]);
    unsigned int iter   = atoi(argv[4]);

    // Setup OpenCL execution system
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    bool foundPlatform = false;
    std::vector<cl::Platform>::iterator it;
    for (it = platforms.begin(); it != platforms.end(); ++it)
    {
        try {
            cl_context_properties props[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(*it)(),0};
            context = cl::Context(CL_DEVICE_TYPE_GPU, props);
            foundPlatform = true;
            break;
        }
        catch (cl::Error e)
        {
            foundPlatform = false;
        }
    }
    if (foundPlatform)
    {
        devices = context.getInfo<CL_CONTEXT_DEVICES>();
        commandQueue = cl::CommandQueue(context,devices[0],0);
    } else {
        throw std::runtime_error("Could not find valid OpenCL platform.");
    }

    // Read source file and compile code
    std::ifstream file(opencl_file, std::ios::in);
    if (!file.is_open())
    {
        throw std::runtime_error("Could not open source file.");
    }
    std::ostringstream srcstrs;
    srcstrs << "#define DTYPE " << xSTR(DTYPE) << "\n";
    srcstrs << file.rdbuf();
    std::string srcstr = srcstrs.str();
#ifdef DEBUG
    std::cout << srcstr << std::endl;
#endif
    cl::Program::Sources source(1,std::make_pair(srcstr.c_str(),0));
    cl::Program program(context, source);
    try {program.build(devices);}
    catch (cl::Error e)
    {
        std::cout << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]) << std::endl;
    }
    kernel = cl::Kernel(program, "heat_eq_jacobi");

    // Set up model
    unsigned int w = width+2;
    unsigned int h = height+2;
    size_t grid_size = w*h*sizeof(DTYPE);
    size_t delta_size = w*sizeof(DTYPE);
    DTYPE* grid = (DTYPE*)malloc(grid_size);
    memset(grid, 0, grid_size);
    for(int j=0; j<w;j++)
    {
        grid[j] = 40.0;
        grid[j+(h-1)*w] = -273.15;
    }
    for(int j=1; j<h-1;j++)
    {
        grid[j*w] = -273.15;
        grid[j*w+w-1]= -273.15;
    }
#ifdef DEBUG
    for (int i = 0; i<h; i++)
    {
        for(int j=0; j<w;j++)
        {
            printf ("%lf ", grid[j+i*w]);
        }
        printf ("\n");
    }
#endif
    // Start timing
    timeval t_start,t_end;
    gettimeofday(&t_start,NULL);

    // Set up buffers and copy data
    cl::Buffer inDev = cl::Buffer(context, CL_MEM_READ_WRITE, grid_size, NULL);
    cl::Buffer outDev = cl::Buffer(context, CL_MEM_READ_WRITE, grid_size, NULL);
    cl::Buffer deltaDev = cl::Buffer(context, CL_MEM_READ_WRITE, delta_size, NULL);
    cl::Buffer deltaHost = cl::Buffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, delta_size, NULL);
    DTYPE* deltaHostPtr = (DTYPE*)commandQueue.enqueueMapBuffer(deltaHost, CL_TRUE, CL_MAP_WRITE, 0, delta_size);
    commandQueue.enqueueWriteBuffer(inDev, CL_FALSE, 0, grid_size, grid);
    commandQueue.enqueueCopyBuffer(inDev, outDev, 0, 0, grid_size);

    // Iterating
    for(int n=0; n<iter; ++n)
    {
        kernel.setArg(0,width);
        kernel.setArg(1,height);
        kernel.setArg(2,inDev);
        kernel.setArg(3,outDev);
        kernel.setArg(4,deltaDev);
        commandQueue.enqueueNDRangeKernel(kernel, cl::NullRange,
                                          cl::NDRange((width/TPB+1)*TPB), cl::NDRange(TPB));
        commandQueue.enqueueReadBuffer(deltaDev, CL_FALSE, 0, delta_size, deltaHostPtr);
        commandQueue.finish();
        DTYPE delta = 0.0;
        for (unsigned int i = 0; i < width; ++i)
            delta += deltaHostPtr[i];
#ifdef DEBUG
        std::cout << "Delta: " << delta << std::endl;
#endif
        cl::Buffer tmp = inDev;
        inDev = outDev;
        outDev = tmp;
    }
    commandQueue.enqueueReadBuffer(inDev, CL_TRUE, 0, grid_size, grid);
    gettimeofday(&t_end,NULL);
#ifdef DEBUG
    for (int i = 0; i<h; i++)
    {
        for(int j=0; j<w;j++)
        {
            printf ("%lf ", grid[j+i*w]);
        }
        printf ("\n");
    }
#endif
    double d_time = (t_end.tv_sec - t_start.tv_sec) + (t_end.tv_usec - t_start.tv_usec)/1000000.0;
    std::cout << argv[0] << " - iter: " << iter << " size: " << width << "*" << height << " elapsed-time: " <<
        d_time << std::endl;
    return 0;
}

Csharp Numcil¶

#region Copyright
/*
This file is part of Bohrium and copyright (c) 2012 the Bohrium:
team <http://www.bh107.org>.

Bohrium is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as 
published by the Free Software Foundation, either version 3 
of the License, or (at your option) any later version.

Bohrium is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the 
GNU Lesser General Public License along with Bohrium. 

If not, see <http://www.gnu.org/licenses/>.
*/
#endregion

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

using R = NumCIL.Range;

namespace HeatEquation
{
	using NumCIL.Double;
	using T = System.Double;

    public static class HeatEquationSolverDouble
    {
    	public static NdArray Create(long width, long height)
    	{
			var res = Generate.Zeroes(height + 2, width + 2);

			res[R.All, R.El(0)] = -273.5f;
			res[R.All, R.El(-1)] = -273.5f;
			res[R.El(0), R.All] = 40f;
			res[R.El(-1), R.All] = -273.5f;

			return res;
    	}
    
        public static Tuple<int, NdArray> Solve(NdArray full, long? fixedIterations = null)
        {			
            var center = full[R.Slice(1, -1),  R.Slice(1, -1) ];
            var north  = full[R.Slice(1, -1),  R.Slice(0, -2) ];
            var east   = full[R.Slice(0, -2),  R.Slice(1, -1) ];
            var west   = full[R.Slice(2,  0),  R.Slice(1, -1) ];
            var south  = full[R.Slice(1, -1),  R.Slice(2,  0) ];


            if (fixedIterations != null)
            {
            	for(var i =0; i < fixedIterations.Value; i++)
            		center[R.All] = 0.2f * (center + north + east + west + south);

				return new Tuple<int, NdArray>((int)fixedIterations.Value, full);
            }
            else
            {
				T epsilon = 0.005f;
	            T delta = epsilon + 1;

	            int iteration = 0;

	            while (epsilon < delta)
	            {
					iteration++;

					var work = 0.2f * (center + north + east + west + south);
					delta = (work - center).Abs().Sum();
					center[R.All] = work;
	            }

				return new Tuple<int, NdArray>(iteration, full);
            }

        }
    }
}

namespace HeatEquation
{
	using NumCIL.Float;
	using T = System.Single;

    public static class HeatEquationSolverSingle
    {
    	public static NdArray Create(long width, long height)
    	{
			var res = Generate.Zeroes(height + 2, width + 2);

			res[R.All, R.El(0)] = -273.5f;
			res[R.All, R.El(-1)] = -273.5f;
			res[R.El(0), R.All] = 40f;
			res[R.El(-1), R.All] = -273.5f;

			return res;
    	}
    
        public static Tuple<int, NdArray> Solve(NdArray full, long? fixedIterations = null)
        {			
            var center = full[R.Slice(1, -1),  R.Slice(1, -1) ];
            var north  = full[R.Slice(1, -1),  R.Slice(0, -2) ];
            var east   = full[R.Slice(0, -2),  R.Slice(1, -1) ];
            var west   = full[R.Slice(2,  0),  R.Slice(1, -1) ];
            var south  = full[R.Slice(1, -1),  R.Slice(2,  0) ];


            if (fixedIterations != null)
            {
            	for(var i =0; i < fixedIterations.Value; i++)
            		center[R.All] = 0.2f * (center + north + east + west + south);

				return new Tuple<int, NdArray>((int)fixedIterations.Value, full);
            }
            else
            {
				T epsilon = 0.005f;
	            T delta = epsilon + 1;

	            int iteration = 0;

	            while (epsilon < delta)
	            {
					iteration++;

					var work = 0.2f * (center + north + east + west + south);
					delta = (work - center).Abs().Sum();
					center[R.All] = work;
	            }

				return new Tuple<int, NdArray>(iteration, full);
            }

        }
    }
}