Shallow Water

Python Numpy

from __future__ import print_function

"""
Shallow Water
-------------

So what does this code example illustrate?

Adapted from: http://people.sc.fsu.edu/~jburkardt/m_src/shallow_water_2d/
"""
import numpy as np
from benchpress.benchmarks import util

bench = util.Benchmark("Shallow Water", "height*width*iterations")
g = 9.80665  # gravitational acceleration


def droplet(height, width, data_type=np.float32):
    """Generate grid of droplets"""

    x = np.array(np.linspace(-1, 1, num=width, endpoint=True), dtype=data_type)
    y = np.array(np.linspace(-1, 1, num=width, endpoint=True), dtype=data_type)
    (xx, yy) = np.meshgrid(x, y)
    droplet = height * np.exp(-5 * (xx ** 2 + yy ** 2))
    return droplet


def model(height, width, dtype=np.float32):
    assert height >= 16
    assert width >= 16
    m = np.ones((height, width), dtype=dtype)
    D = droplet(8, 8)  # simulate a water drop
    droploc = height // 4
    (dropx, dropy) = D.shape
    m[droploc:droploc + dropx, droploc:droploc + dropy] += D
    droploc = height // 2
    (dropx, dropy) = D.shape
    m[droploc:droploc + dropx, droploc:droploc + dropy] += D
    return {"H": m, "U": np.zeros_like(m), "V": np.zeros_like(m)}


def simulate(state, timesteps):
    # FLOP count: i*(12*s + 4*s**2 + 14*s**2 + 9*s**2 + 4*s**2 + 9*s**2 + 14*s**2 + 6*s**2 + 19*s**2 + 19*s**2)
    # where s is size and i is iterations
    def loop_body(H, U, V, dt=0.02, dx=1.0, dy=1.0):
        # Reflecting boundary conditions
        H[:, 0] = H[:, 1];
        U[:, 0] = U[:, 1];
        V[:, 0] = -V[:, 1]
        H[:, -1] = H[:, -2];
        U[:, -1] = U[:, -2];
        V[:, -1] = -V[:, -2]
        H[0, :] = H[1, :];
        U[0, :] = -U[1, :];
        V[0, :] = V[1, :]
        H[-1, :] = H[-2, :];
        U[-1, :] = -U[-2, :];
        V[-1, :] = V[-2, :]

        # First half step

        # height
        Hx = (H[1:, 1:-1] + H[:-1, 1:-1]) / 2 - dt / (2 * dx) * (U[1:, 1:-1] - U[:-1, 1:-1])

        # x momentum
        Ux = (U[1:, 1:-1] + U[:-1, 1:-1]) / 2 - \
             dt / (2 * dx) * ((U[1:, 1:-1] ** 2 / H[1:, 1:-1] + g / 2 * H[1:, 1:-1] ** 2) -
                              (U[:-1, 1:-1] ** 2 / H[:-1, 1:-1] + g / 2 * H[:-1, 1:-1] ** 2))

        # y momentum
        Vx = (V[1:, 1:-1] + V[:-1, 1:-1]) / 2 - \
             dt / (2 * dx) * ((U[1:, 1:-1] * V[1:, 1:-1] / H[1:, 1:-1]) -
                              (U[:-1, 1:-1] * V[:-1, 1:-1] / H[:-1, 1:-1]))

        # height
        Hy = (H[1:-1, 1:] + H[1:-1, :-1]) / 2 - dt / (2 * dy) * (V[1:-1, 1:] - V[1:-1, :-1])

        # x momentum
        Uy = (U[1:-1, 1:] + U[1:-1, :-1]) / 2 - \
             dt / (2 * dy) * ((V[1:-1, 1:] * U[1:-1, 1:] / H[1:-1, 1:]) -
                              (V[1:-1, :-1] * U[1:-1, :-1] / H[1:-1, :-1]))
        # y momentum
        Vy = (V[1:-1, 1:] + V[1:-1, :-1]) / 2 - \
             dt / (2 * dy) * ((V[1:-1, 1:] ** 2 / H[1:-1, 1:] + g / 2 * H[1:-1, 1:] ** 2) -
                              (V[1:-1, :-1] ** 2 / H[1:-1, :-1] + g / 2 * H[1:-1, :-1] ** 2))

        # Second half step

        # height
        H[1:-1, 1:-1] -= (dt / dx) * (Ux[1:, :] - Ux[:-1, :]) + (dt / dy) * (Vy[:, 1:] - Vy[:, :-1])

        # x momentum
        U[1:-1, 1:-1] -= (dt / dx) * ((Ux[1:, :] ** 2 / Hx[1:, :] + g / 2 * Hx[1:, :] ** 2) -
                                      (Ux[:-1, :] ** 2 / Hx[:-1, :] + g / 2 * Hx[:-1, :] ** 2)) + \
                         (dt / dy) * ((Vy[:, 1:] * Uy[:, 1:] / Hy[:, 1:]) -
                                      (Vy[:, :-1] * Uy[:, :-1] / Hy[:, :-1]))
        # y momentum
        V[1:-1, 1:-1] -= (dt / dx) * ((Ux[1:, :] * Vx[1:, :] / Hx[1:, :]) -
                                      (Ux[:-1, :] * Vx[:-1, :] / Hx[:-1, :])) + \
                         (dt / dy) * ((Vy[:, 1:] ** 2 / Hy[:, 1:] + g / 2 * Hy[:, 1:] ** 2) -
                                      (Vy[:, :-1] ** 2 / Hy[:, :-1] + g / 2 * Hy[:, :-1] ** 2))
        bench.plot_surface(H, "3d", 0, 0, 5.5)

    bench.do_while(loop_body, timesteps, state['H'], state['U'], state['V'])


def main():
    H = bench.args.size[0]
    W = bench.args.size[1]
    I = bench.args.size[2]

    state = bench.load_data()
    if state is None:
        state = model(H, W, dtype=bench.dtype)

    bench.start()
    simulate(state, I)
    bench.stop()
    bench.save_data(state)
    bench.pprint()


if __name__ == "__main__":
    main()

C99 Seq

// Adapted from: http://people.sc.fsu.edu/~jburkardt/m_src/shallow_water_2d/
// Saved images may be converted into an animated gif with:
// convert   -delay 20   -loop 0   swater*.png   swater.gif


#include <math.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <bp_util.h>

#define iH(y,x) (H[(x)+(y)*(n+2)])
#define iU(y,x) (U[(x)+(y)*(n+2)])
#define iV(y,x) (V[(x)+(y)*(n+2)])

#define iHx(y,x) (Hx[(x)+(y)*(n+1)])
#define iUx(y,x) (Ux[(x)+(y)*(n+1)])
#define iVx(y,x) (Vx[(x)+(y)*(n+1)])

#define iHy(y,x) (Hy[(x)+(y)*(n+1)])
#define iUy(y,x) (Uy[(x)+(y)*(n+1)])
#define iVy(y,x) (Vy[(x)+(y)*(n+1)])

int main (int argc, char **argv)
{
    bp_util_type bp = bp_util_create(argc, argv, 3);
    if (bp.args.has_error) {
        return 1;
    }
    const int n = bp.args.sizes[0];
    const int other_n = bp.args.sizes[1];
    const int T = bp.args.sizes[2];

    if (n != other_n) {
        fprintf(stderr, "Implementation only supports quadratic grid.\n");
        exit(0);
    }

    const double g   = 9.8;       // gravitational constant
    const double dt  = 0.02;      // hardwired timestep
    const double dx  = 1.0;
    const double dy  = 1.0;
    const int droploc = n/4;

    double *H = (double*) malloc((n+2)*(n+2)*sizeof(double));
    double *U = (double*) malloc((n+2)*(n+2)*sizeof(double));
    double *V = (double*) malloc((n+2)*(n+2)*sizeof(double));

    double *Hx = (double*) malloc((n+1)*(n+1)*sizeof(double));
    double *Ux = (double*) malloc((n+1)*(n+1)*sizeof(double));
    double *Vx = (double*) malloc((n+1)*(n+1)*sizeof(double));

    double *Hy = (double*) malloc((n+1)*(n+1)*sizeof(double));
    double *Uy = (double*) malloc((n+1)*(n+1)*sizeof(double));
    double *Vy = (double*) malloc((n+1)*(n+1)*sizeof(double));


    for(int i=0; i<n+2; i++)
        for(int j=0; j<n+2; j++)
        {
          iH(i,j)=0.1;
          iU(i,j)=0.1;
          iV(i,j)=0.1;
        }


    for(int i=0; i<n+1; i++)
        for(int j=0; j<n+1; j++)
        {
          iHx(i,j)=0.1;
          iUx(i,j)=0.1;
          iVx(i,j)=0.1;

          iHy(i,j)=0.1;
          iUy(i,j)=0.1;
          iVy(i,j)=0.1;
        }

    iH(droploc,droploc) += 5.0;

    
    bp.timer_start();

    for(int iter=0; iter < T; iter++)
    {
        // Reflecting boundary conditions
        for(int i=0; i<n+2; i++)
        {
            iH(i,0) = iH(i,1)   ; iU(i,0) = iU(i,1)     ; iV(i,0) = -iV(i,1);
            iH(i,n+1) = iH(i,n) ; iU(i,n+1) = iU(i,n)   ; iV(i,n+1) = -iV(i,n);
            iH(0,i) = iH(1,i)   ; iU(0,i) = -iU(1,i)    ; iV(0,i) = iV(1,i);
            iH(n+1,i) = iH(n,i) ; iU(n+1,i) = -iU(n,i)  ; iV(n+1,i) = iV(n,i);
        }
        //
        // First half step
        //
        for(int i=0; i<n+1; i++)
        {
            for(int j=0; j<n; j++)
            {

                // height
                iHx(i,j) = (iH(i+1,j+1)+iH(i,j+1))/2 - dt/(2*dx)*(iU(i+1,j+1)-iU(i,j+1));

                // x momentum
                iUx(i,j) = (iU(i+1,j+1)+iU(i,j+1))/2 -          \
                dt/(2*dx)*((pow(iU(i+1,j+1),2)/iH(i+1,j+1) +	\
                          g/2*pow(iH(i+1,j+1),2)) -		\
                         (pow(iU(i,j+1),2)/iH(i,j+1) +	        \
                          g/2*pow(iH(i,j+1),2)));

                // y momentum
                iVx(i,j) = (iV(i+1,j+1)+iV(i,j+1))/2 -          \
                      dt/(2*dx)*((iU(i+1,j+1) *                 \
                                  iV(i+1,j+1)/iH(i+1,j+1)) -    \
                                 (iU(i,j+1) *                   \
                                  iV(i,j+1)/iH(i,j+1)));
            }
        }

        for(int i=0; i<n; i++)
        {
            for(int j=0; j<n+1; j++)
            {
                //height
                iHy(i,j) = (iH(i+1,j+1)+iH(i+1,j))/2 - dt/(2*dy)*(iV(i+1,j+1)-iV(i+1,j));

                //x momentum
                iUy(i,j) = (iU(i+1,j+1)+iU(i+1,j))/2 -	   \
                dt/(2*dy)*((iV(i+1,j+1) *                  \
                           iU(i+1,j+1)/iH(i+1,j+1)) -	   \
                                (iV(i+1,j) *               \
                                 iU(i+1,j)/iH(i+1,j)));

                //y momentum
                iVy(i,j) = (iV(i+1,j+1)+iV(i+1,j))/2 -	\
                dt/(2*dy)*((pow(iV(i+1,j+1),2)/iH(i+1,j+1) +	\
                           g/2*pow(iH(i+1,j+1),2)) -	\
                          (pow(iV(i+1,j),2)/iH(i+1,j) +	\
                           g/2*pow(iH(i+1,j),2)));
            }
        }
        //
        // Second half step
        //

        for(int i=1; i<n+1; i++)
        {
            for(int j=1; j<n+1; j++)
            {
                //height
                iH(i,j) -= (dt/dx)*(iUx(i,j-1)-iUx(i-1,j-1)) - (dt/dy)*(iVy(i-1,j)-iVy(i-1,j-1));

                // x momentum
                iU(i,j) -= (dt/dx)*((pow(iUx(i,j-1),2)/iHx(i,j-1) + g/2*pow(iHx(i,j-1),2)) -        \
                                    (pow(iUx(i-1,j-1),2)/iHx(i-1,j-1) + g/2*pow(iHx(i-1,j-1),2))) - \
                           (dt/dy)*((iVy(i-1,j) * iUy(i-1,j)/iHy(i-1,j)) -
                                    (iVy(i-1,j-1) * iUy(i-1,j-1)/iHy(i-1,j-1)));

                // y momentum    - score
                iV(i,j) -= (dt/dx)*((iUx(i,j-1) * iVx(i,j-1)/iHx(i,j-1)) -                         \
                                    (iUx(i-1,j-1)*iVx(i-1,j-1)/iHx(i-1,j-1))) -                    \
                           (dt/dy)*((pow(iVy(i-1,j),2)/iHy(i-1,j) + g/2*pow(iHy(i-1,j),2)) -       \
                                    (pow(iVy(i-1,j-1),2)/iHy(i-1,j-1) + g/2*pow(iHy(i-1,j-1),2)));

            }
        }
    }

    /* NumPy implementation does not do this!
    // res = numpy.add.reduce(numpy.add.reduce(H / n))
    double res = 0.0;
    for(int i=0;i<n+2;i++)
        for(int j=0;j<n+2;j++)
            res+=iH(i,j)/n;
    */
    bp.timer_stop();
    bp.print("shallow_water(c99_seq)");
}

Cpp11 Boost

// Adapted from: http://people.sc.fsu.edu/~jburkardt/m_src/shallow_water_2d/
// Saved images may be converted into an animated gif with:
// convert   -delay 20   -loop 0   swater*.png   swater.gif
#include <math.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <bp_util.h>

#include <boost/multi_array.hpp>

int main (int argc, char **argv)
{
    bp_util_type bp = bp_util_create(argc, argv, 3);
    if (bp.args.has_error) {
        return 1;
    }
    const int n         = bp.args.sizes[0];
    const int other_n   = bp.args.sizes[1];
    const int T         = bp.args.sizes[2];

    if (n != other_n) {
        fprintf(stderr, "Implementation only supports quadratic grid.\n");
        exit(0);
    }

    const double g   = 9.8;       // gravitational constant
    const double dt  = 0.02;      // hardwired timestep
    const double dx  = 1.0;
    const double dy  = 1.0;
    const int droploc = n/4;

    typedef boost::multi_array<double, 2> array_type;

    array_type H(boost::extents[n+2][n+2]);
    array_type U(boost::extents[n+2][n+2]);
    array_type V(boost::extents[n+2][n+2]);

    array_type Hx(boost::extents[n+1][n+1]);
    array_type Ux(boost::extents[n+1][n+1]);
    array_type Vx(boost::extents[n+1][n+1]);
    array_type Hy(boost::extents[n+1][n+1]);
    array_type Uy(boost::extents[n+1][n+1]);
    array_type Vy(boost::extents[n+1][n+1]);


    for(int i=0; i<n+2; i++)
        for(int j=0; j<n+2; j++)
        {
          H[i][j]=0.1;
          U[i][j]=0.1;
          V[i][j]=0.1;
        }


    for(int i=0; i<n+1; i++)
        for(int j=0; j<n+1; j++)
        {
          Hx[i][j]=0.1;
          Ux[i][j]=0.1;
          Vx[i][j]=0.1;

          Hy[i][j]=0.1;
          Uy[i][j]=0.1;
          Vy[i][j]=0.1;
        }

    H[droploc][droploc] += 5.0;

    bp.timer_start();

    for(int iter=0; iter < T; iter++)
    {
        // Reflecting boundary conditions
        for(int i=0; i<n+2; i++)
        {
            H[i][0] = H[i][1]   ; U[i][0] = U[i][1]     ; V[i][0] = -V[i][1];
            H[i][n+1] = H[i][n] ; U[i][n+1] = U[i][n]   ; V[i][n+1] = -V[i][n];
            H[0][i] = H[1][i]   ; U[0][i] = -U[1][i]    ; V[0][i] = V[1][i];
            H[n+1][i] = H[n][i] ; U[n+1][i] = -U[n][i]  ; V[n+1][i] = V[n][i];
        }
        //
        // First half step
        //
        for(int i=0; i<n+1; i++)
        {
            for(int j=0; j<n; j++)
            {

                // height
                Hx[i][j] = (H[i+1][j+1]+H[i][j+1])/2 - dt/(2*dx)*(U[i+1][j+1]-U[i][j+1]);

                // x momentum
                Ux[i][j] = (U[i+1][j+1]+U[i][j+1])/2 -          \
                dt/(2*dx)*((pow(U[i+1][j+1],2)/H[i+1][j+1] +	\
                          g/2*pow(H[i+1][j+1],2)) -		\
                         (pow(U[i][j+1],2)/H[i][j+1] +	        \
                          g/2*pow(H[i][j+1],2)));

                // y momentum
                Vx[i][j] = (V[i+1][j+1]+V[i][j+1])/2 -          \
                      dt/(2*dx)*((U[i+1][j+1] *                 \
                                  V[i+1][j+1]/H[i+1][j+1]) -    \
                                 (U[i][j+1] *                   \
                                  V[i][j+1]/H[i][j+1]));
            }
        }

        for(int i=0; i<n; i++)
        {
            for(int j=0; j<n+1; j++)
            {
                //height
                Hy[i][j] = (H[i+1][j+1]+H[i+1][j])/2 - dt/(2*dy)*(V[i+1][j+1]-V[i+1][j]);

                //x momentum
                Uy[i][j] = (U[i+1][j+1]+U[i+1][j])/2 -	   \
                dt/(2*dy)*((V[i+1][j+1] *                  \
                           U[i+1][j+1]/H[i+1][j+1]) -	   \
                                (V[i+1][j] *               \
                                 U[i+1][j]/H[i+1][j]));

                //y momentum
                Vy[i][j] = (V[i+1][j+1]+V[i+1][j])/2 -	\
                dt/(2*dy)*((pow(V[i+1][j+1],2)/H[i+1][j+1] +	\
                           g/2*pow(H[i+1][j+1],2)) -	\
                          (pow(V[i+1][j],2)/H[i+1][j] +	\
                           g/2*pow(H[i+1][j],2)));
            }
        }
        //
        // Second half step
        //

        for(int i=1; i<n+1; i++)
        {
            for(int j=1; j<n+1; j++)
            {
                //height
                H[i][j] -= (dt/dx)*(Ux[i][j-1]-Ux[i-1][j-1]) - (dt/dy)*(Vy[i-1][j]-Vy[i-1][j-1]);

                // x momentum
                U[i][j] -= (dt/dx)*((pow(Ux[i][j-1],2)/Hx[i][j-1] + g/2*pow(Hx[i][j-1],2)) -        \
                                    (pow(Ux[i-1][j-1],2)/Hx[i-1][j-1] + g/2*pow(Hx[i-1][j-1],2))) - \
                           (dt/dy)*((Vy[i-1][j] * Uy[i-1][j]/Hy[i-1][j]) -
                                    (Vy[i-1][j-1] * Uy[i-1][j-1]/Hy[i-1][j-1]));

                // y momentum    - score
                V[i][j] -= (dt/dx)*((Ux[i][j-1] * Vx[i][j-1]/Hx[i][j-1]) -                         \
                                    (Ux[i-1][j-1]*Vx[i-1][j-1]/Hx[i-1][j-1])) -                    \
                           (dt/dy)*((pow(Vy[i-1][j],2)/Hy[i-1][j] + g/2*pow(Hy[i-1][j],2)) -       \
                                    (pow(Vy[i-1][j-1],2)/Hy[i-1][j-1] + g/2*pow(Hy[i-1][j-1],2)));

            }
        }
    }

    // res = numpy.add.reduce(numpy.add.reduce(H / n))
    double res = 0.0;
    for(int i=0;i<n+2;i++)
        for(int j=0;j<n+2;j++)
            res+=H[i][j]/n;

    bp.timer_stop();
    bp.print("shallow_water(cpp11_boost)");
}

Cpp11 Omp

// Adapted from: http://people.sc.fsu.edu/~jburkardt/m_src/shallow_water_2d/
// Saved images may be converted into an animated gif with:
// convert   -delay 20   -loop 0   swater*.png   swater.gif
#include <math.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <bp_util.h>

#define WIDTH 5000
#define HEIGHT 5000

int main (int argc, char **argv)
{
    bp_util_type bp = bp_util_create(argc, argv, 3);
    if (bp.args.has_error) {
        return 1;
    }
    const int height        = bp.args.sizes[0];
    const int width         = bp.args.sizes[1];
    const int iterations    = bp.args.sizes[2];

    if (width != WIDTH) {
        fprintf(stderr, "Unsupported size, size must be %dx%d\n", WIDTH, HEIGHT);
        exit(0);
    }
    if (height != WIDTH) {
        fprintf(stderr, "Unsupported size, size must be %dx%d\n", WIDTH, HEIGHT);
        exit(0);
    }

    if (width != height) {
        fprintf(stderr, "Implementation only supports quadratic grid.\n");
        exit(0);
    }

    const double g   = 9.8;       // gravitational constant
    const double dt  = 0.02;      // hardwired timestep
    const double dx  = 1.0;
    const double dy  = 1.0;
    const int droploc = WIDTH/4;

    auto H = new double[WIDTH+2][WIDTH+2];
    auto U = new double[WIDTH+2][WIDTH+2];
    auto V = new double[WIDTH+2][WIDTH+2];

    auto Hx = new double[WIDTH+1][WIDTH+1];
    auto Ux = new double[WIDTH+1][WIDTH+1];
    auto Vx = new double[WIDTH+1][WIDTH+1];
    auto Hy = new double[WIDTH+1][WIDTH+1];
    auto Uy = new double[WIDTH+1][WIDTH+1];
    auto Vy = new double[WIDTH+1][WIDTH+1];

    #pragma omp parallel for collapse(2)
    for(int i=0; i<HEIGHT+2; i++) {
        for(int j=0; j<WIDTH+2; j++) {
          H[i][j]=0.1;
          U[i][j]=0.1;
          V[i][j]=0.1;
        }
    }

    #pragma omp parallel for collapse(2)
    for(int i=0; i<HEIGHT+1; i++) {
        for(int j=0; j<WIDTH+1; j++) {
          Hx[i][j]=0.1;
          Ux[i][j]=0.1;
          Vx[i][j]=0.1;

          Hy[i][j]=0.1;
          Uy[i][j]=0.1;
          Vy[i][j]=0.1;
        }
    }
    H[droploc][droploc] += 5.0;

    bp.timer_start();

    for(int iter=0; iter < iterations; iter++) {

        #pragma omp parallel for        
        for(int i=0; i<WIDTH+2; i++) {  // Reflecting boundary conditions
            H[i][0] = H[i][1];
            U[i][0] = U[i][1];
            V[i][0] = -V[i][1];
            H[i][WIDTH+1] = H[i][WIDTH];
            U[i][WIDTH+1] = U[i][WIDTH];
            V[i][WIDTH+1] = -V[i][WIDTH];
            H[0][i] = H[1][i];
            U[0][i] = -U[1][i];
            V[0][i] = V[1][i];
            H[HEIGHT+1][i] = H[HEIGHT][i];
            U[HEIGHT+1][i] = -U[HEIGHT][i];
            V[HEIGHT+1][i] = V[HEIGHT][i];
        }

        //
        // First half step
        //
        #pragma omp parallel for collapse(2)
        for(int i=0; i<HEIGHT+1; i++) {
            for(int j=0; j<WIDTH; j++) {

                // height
                Hx[i][j] = (H[i+1][j+1]+H[i][j+1])/2 - dt/(2*dx)*(U[i+1][j+1]-U[i][j+1]);

                // x momentum
                Ux[i][j] = (U[i+1][j+1]+U[i][j+1])/2 -          \
                dt/(2*dx)*((pow(U[i+1][j+1],2)/H[i+1][j+1] +	\
                          g/2*pow(H[i+1][j+1],2)) -		\
                         (pow(U[i][j+1],2)/H[i][j+1] +	        \
                          g/2*pow(H[i][j+1],2)));

                // y momentum
                Vx[i][j] = (V[i+1][j+1]+V[i][j+1])/2 -          \
                      dt/(2*dx)*((U[i+1][j+1] *                 \
                                  V[i+1][j+1]/H[i+1][j+1]) -    \
                                 (U[i][j+1] *                   \
                                  V[i][j+1]/H[i][j+1]));
            }
        }

        #pragma omp parallel for collapse(2)
        for(int i=0; i<HEIGHT; i++) {
            for(int j=0; j<WIDTH+1; j++) {
                //height
                Hy[i][j] = (H[i+1][j+1]+H[i+1][j])/2 - dt/(2*dy)*(V[i+1][j+1]-V[i+1][j]);

                //x momentum
                Uy[i][j] = (U[i+1][j+1]+U[i+1][j])/2 -	   \
                dt/(2*dy)*((V[i+1][j+1] *                  \
                           U[i+1][j+1]/H[i+1][j+1]) -	   \
                                (V[i+1][j] *               \
                                 U[i+1][j]/H[i+1][j]));

                //y momentum
                Vy[i][j] = (V[i+1][j+1]+V[i+1][j])/2 -	\
                dt/(2*dy)*((pow(V[i+1][j+1],2)/H[i+1][j+1] +	\
                           g/2*pow(H[i+1][j+1],2)) -	\
                          (pow(V[i+1][j],2)/H[i+1][j] +	\
                           g/2*pow(H[i+1][j],2)));
            }
        }

        //
        // Second half step
        //

        #pragma omp parallel for collapse(2)
        for(int i=1; i<HEIGHT+1; i++) {
            for(int j=1; j<WIDTH+1; j++) {
                //height
                H[i][j] -= (dt/dx)*(Ux[i][j-1]-Ux[i-1][j-1]) - (dt/dy)*(Vy[i-1][j]-Vy[i-1][j-1]);

                // x momentum
                U[i][j] -= (dt/dx)*((pow(Ux[i][j-1],2)/Hx[i][j-1] + g/2*pow(Hx[i][j-1],2)) -        \
                                    (pow(Ux[i-1][j-1],2)/Hx[i-1][j-1] + g/2*pow(Hx[i-1][j-1],2))) - \
                           (dt/dy)*((Vy[i-1][j] * Uy[i-1][j]/Hy[i-1][j]) -
                                    (Vy[i-1][j-1] * Uy[i-1][j-1]/Hy[i-1][j-1]));

                // y momentum    - score
                V[i][j] -= (dt/dx)*((Ux[i][j-1] * Vx[i][j-1]/Hx[i][j-1]) -                         \
                                    (Ux[i-1][j-1]*Vx[i-1][j-1]/Hx[i-1][j-1])) -                    \
                           (dt/dy)*((pow(Vy[i-1][j],2)/Hy[i-1][j] + g/2*pow(Hy[i-1][j],2)) -       \
                                    (pow(Vy[i-1][j-1],2)/Hy[i-1][j-1] + g/2*pow(Hy[i-1][j-1],2)));
            }
        }
    }

    /* NumPy implementation does not do this!?
    // res = numpy.add.reduce(numpy.add.reduce(H / n))
    double res = 0.0;
    #pragma omp parallel for collapse(2) reduction(+:res)
    for(int i=0;i<HEIGHT+2;i++) {
        for(int j=0;j<WIDTH+2;j++) {
            res+=H[i][j]/WIDTH;
        }
    }
    */

    bp.timer_stop();
    bp.print("shallow_water(cpp11_omp)");
}

Cpp11 Seq

// Adapted from: http://people.sc.fsu.edu/~jburkardt/m_src/shallow_water_2d/
// Saved images may be converted into an animated gif with:
// convert   -delay 20   -loop 0   swater*.png   swater.gif
#include <math.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <bp_util.h>

#define WIDTH 5000
#define HEIGHT 5000

int main (int argc, char **argv)
{
    bp_util_type bp = bp_util_create(argc, argv, 3);
    if (bp.args.has_error) {
        return 1;
    }
    const int height    = bp.args.sizes[0];
    const int width     = bp.args.sizes[1];
    const int T         = bp.args.sizes[2];

    if (width != WIDTH) {
        fprintf(stderr, "Unsupported size, size must be %dx%d\n", WIDTH, HEIGHT);
        exit(0);
    }
    if (height != WIDTH) {
        fprintf(stderr, "Unsupported size, size must be %dx%d\n", WIDTH, HEIGHT);
        exit(0);
    }

    if (width != height) {
        fprintf(stderr, "Implementation only supports quadratic grid.\n");
        exit(0);
    }

    const double g   = 9.8;       // gravitational constant
    const double dt  = 0.02;      // hardwired timestep
    const double dx  = 1.0;
    const double dy  = 1.0;
    const int droploc = WIDTH/4;

    auto H = new double[WIDTH+2][WIDTH+2];
    auto U = new double[WIDTH+2][WIDTH+2];
    auto V = new double[WIDTH+2][WIDTH+2];

    auto Hx = new double[WIDTH+1][WIDTH+1];
    auto Ux = new double[WIDTH+1][WIDTH+1];
    auto Vx = new double[WIDTH+1][WIDTH+1];
    auto Hy = new double[WIDTH+1][WIDTH+1];
    auto Uy = new double[WIDTH+1][WIDTH+1];
    auto Vy = new double[WIDTH+1][WIDTH+1];

    for(int i=0; i<HEIGHT+2; i++) {
        for(int j=0; j<WIDTH+2; j++) {
          H[i][j]=0.1;
          U[i][j]=0.1;
          V[i][j]=0.1;
        }
    }


    for(int i=0; i<HEIGHT+1; i++) {
        for(int j=0; j<WIDTH+1; j++) {
          Hx[i][j]=0.1;
          Ux[i][j]=0.1;
          Vx[i][j]=0.1;

          Hy[i][j]=0.1;
          Uy[i][j]=0.1;
          Vy[i][j]=0.1;
        }
    }
    H[droploc][droploc] += 5.0;

    bp.timer_start();

    for(int iter=0; iter < T; iter++) {
        
        for(int i=0; i<WIDTH+2; i++) {  // Reflecting boundary conditions
            H[i][0] = H[i][1];
            U[i][0] = U[i][1];
            V[i][0] = -V[i][1];
            H[i][WIDTH+1] = H[i][WIDTH];
            U[i][WIDTH+1] = U[i][WIDTH];
            V[i][WIDTH+1] = -V[i][WIDTH];
            H[0][i] = H[1][i];
            U[0][i] = -U[1][i];
            V[0][i] = V[1][i];
            H[HEIGHT+1][i] = H[HEIGHT][i];
            U[HEIGHT+1][i] = -U[HEIGHT][i];
            V[HEIGHT+1][i] = V[HEIGHT][i];
        }

        //
        // First half step
        //

        for(int i=0; i<HEIGHT+1; i++) {
            for(int j=0; j<WIDTH; j++) {

                // height
                Hx[i][j] = (H[i+1][j+1]+H[i][j+1])/2 - dt/(2*dx)*(U[i+1][j+1]-U[i][j+1]);

                // x momentum
                Ux[i][j] = (U[i+1][j+1]+U[i][j+1])/2 -          \
                dt/(2*dx)*((pow(U[i+1][j+1],2)/H[i+1][j+1] +	\
                          g/2*pow(H[i+1][j+1],2)) -		\
                         (pow(U[i][j+1],2)/H[i][j+1] +	        \
                          g/2*pow(H[i][j+1],2)));

                // y momentum
                Vx[i][j] = (V[i+1][j+1]+V[i][j+1])/2 -          \
                      dt/(2*dx)*((U[i+1][j+1] *                 \
                                  V[i+1][j+1]/H[i+1][j+1]) -    \
                                 (U[i][j+1] *                   \
                                  V[i][j+1]/H[i][j+1]));
            }
        }

        for(int i=0; i<HEIGHT; i++) {
            for(int j=0; j<WIDTH+1; j++) {
                //height
                Hy[i][j] = (H[i+1][j+1]+H[i+1][j])/2 - dt/(2*dy)*(V[i+1][j+1]-V[i+1][j]);

                //x momentum
                Uy[i][j] = (U[i+1][j+1]+U[i+1][j])/2 -	   \
                dt/(2*dy)*((V[i+1][j+1] *                  \
                           U[i+1][j+1]/H[i+1][j+1]) -	   \
                                (V[i+1][j] *               \
                                 U[i+1][j]/H[i+1][j]));

                //y momentum
                Vy[i][j] = (V[i+1][j+1]+V[i+1][j])/2 -	\
                dt/(2*dy)*((pow(V[i+1][j+1],2)/H[i+1][j+1] +	\
                           g/2*pow(H[i+1][j+1],2)) -	\
                          (pow(V[i+1][j],2)/H[i+1][j] +	\
                           g/2*pow(H[i+1][j],2)));
            }
        }

        //
        // Second half step
        //

        for(int i=1; i<HEIGHT+1; i++) {
            for(int j=1; j<WIDTH+1; j++) {
                //height
                H[i][j] -= (dt/dx)*(Ux[i][j-1]-Ux[i-1][j-1]) - (dt/dy)*(Vy[i-1][j]-Vy[i-1][j-1]);

                // x momentum
                U[i][j] -= (dt/dx)*((pow(Ux[i][j-1],2)/Hx[i][j-1] + g/2*pow(Hx[i][j-1],2)) -        \
                                    (pow(Ux[i-1][j-1],2)/Hx[i-1][j-1] + g/2*pow(Hx[i-1][j-1],2))) - \
                           (dt/dy)*((Vy[i-1][j] * Uy[i-1][j]/Hy[i-1][j]) -
                                    (Vy[i-1][j-1] * Uy[i-1][j-1]/Hy[i-1][j-1]));

                // y momentum    - score
                V[i][j] -= (dt/dx)*((Ux[i][j-1] * Vx[i][j-1]/Hx[i][j-1]) -                         \
                                    (Ux[i-1][j-1]*Vx[i-1][j-1]/Hx[i-1][j-1])) -                    \
                           (dt/dy)*((pow(Vy[i-1][j],2)/Hy[i-1][j] + g/2*pow(Hy[i-1][j],2)) -       \
                                    (pow(Vy[i-1][j-1],2)/Hy[i-1][j-1] + g/2*pow(Hy[i-1][j-1],2)));

            }
        }
    }

    /* NumPy implementation does not do this!?
    // res = numpy.add.reduce(numpy.add.reduce(H / n))
    double res = 0.0;
    for(int i=0;i<HEIGHT+2;i++) {
        for(int j=0;j<WIDTH+2;j++) {
            res+=H[i][j]/WIDTH;
        }
    }
    */

    bp.timer_stop();
    bp.print("shallow_water(cpp11_seq)");
}

Csharp Numcil

#region Copyright
/*
This file is part of Bohrium and copyright (c) 2012 the Bohrium:
team <http://www.bh107.org>.

Bohrium is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation, either version 3
of the License, or (at your option) any later version.

Bohrium is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the
GNU Lesser General Public License along with Bohrium.

If not, see <http://www.gnu.org/licenses/>.
*/
#endregion

//Adapted from: http://people.sc.fsu.edu/~jburkardt/m_src/shallow_water_2d/

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using R = NumCIL.Range;
using NumCIL;

namespace shallow_water
{
    public static class ShallowWaterSolver
    {
        /// <summary>
        /// NumCIL equivalent to [:]
        /// </summary>
        public static readonly R ALL = R.All;
        /// <summary>
        /// NumCIL equivalent to [1:-1]
        /// </summary>
        public static readonly R INNER = R.Slice(1, -1);
        /// <summary>
        /// NumCIL equivalent to [0]
        /// </summary>
        public static readonly R FIRST = R.El(0);
        /// <summary>
        /// NumCIL equivalent to [1]
        /// </summary>
        public static readonly R SECOND = R.El(1);
        /// <summary>
        /// NumCIL equivalent to [-1]
        /// </summary>
        public static readonly R LAST = R.El(-1);
        /// <summary>
        /// NumCIL equivalent to [:-1]
        /// </summary>
        public static readonly R ZM1 = R.Slice(0, -1);
        /// <summary>
        /// NumCIL equivalent to [1:]
        /// </summary>
        public static readonly R SKIP1 = R.Slice(1, 0);


        public class DataDouble
        {
            public readonly long N;
            public readonly NumCIL.Double.NdArray H;
            public readonly NumCIL.Double.NdArray U;
            public readonly NumCIL.Double.NdArray V;
            public readonly NumCIL.Double.NdArray Hx;
            public readonly NumCIL.Double.NdArray Ux;
            public readonly NumCIL.Double.NdArray Vx;
            public readonly NumCIL.Double.NdArray Hy;
            public readonly NumCIL.Double.NdArray Uy;
            public readonly NumCIL.Double.NdArray Vy;

            public DataDouble(long n)
            {
                N = n;
                H = NumCIL.Double.Generate.Ones(n + 2, n + 2);
                U = NumCIL.Double.Generate.Ones(n + 2, n + 2);
                V = NumCIL.Double.Generate.Ones(n + 2, n + 2);
                Hx = NumCIL.Double.Generate.Ones(n + 1, n + 1);
                   Ux = NumCIL.Double.Generate.Ones(n + 1, n + 1);
                Vx = NumCIL.Double.Generate.Ones(n + 1, n + 1);
                   Hy = NumCIL.Double.Generate.Ones(n + 1, n + 1);
                Uy = NumCIL.Double.Generate.Ones(n + 1, n + 1);
                Vy = NumCIL.Double.Generate.Ones(n + 1, n + 1);
            }
        }


        public static System.Double SolveDouble(long timesteps, DataDouble data)
        {
            //Convenience indices
            R RN = R.El(data.N); //Same as [n]
            R RN1 = R.El(data.N + 1); //Same as [n+1]

            System.Double g = 9.8f;// gravitational constant
            System.Double dt = 0.02f; // hardwired timestep
            System.Double dx = 1.0f;
            System.Double dy = 1.0f;
            long droploc = data.N / 4;

            var H = data.H;
            var U = data.U;
            var V = data.V;
            var Hx = data.Hx;
            var Ux = data.Ux;
            var Vx = data.Vx;
            var Hy = data.Hy;
            var Uy = data.Uy;
            var Vy = data.Vy;

            //Splash!!!
            H[droploc, droploc] += 5.0f;

            for (int i = 0; i < timesteps; i++)
            {
                H.Flush();
                // Reflecting boundary conditions
                H[ALL, FIRST] = H[ALL, SECOND];
                U[ALL, FIRST] = U[ALL, SECOND];
                V[ALL, FIRST] = -V[ALL, SECOND];
                H[ALL, RN1] = H[ALL, RN];
                U[ALL, RN1] = U[ALL, RN];
                V[ALL, RN1] = -V[ALL, RN];
                H[FIRST, ALL] = H[SECOND, ALL];
                U[FIRST, ALL] = -U[SECOND, ALL];
                V[FIRST, ALL] = V[SECOND, ALL];
                H[RN1, ALL] = H[RN, ALL];
                U[RN1, ALL] = -U[RN, ALL];
                V[RN1, ALL] = V[RN, ALL];

                //First half-step

                //Height
                Hx[ALL, R.Slice(0, -1)] = (H[SKIP1,INNER]+H[ZM1,INNER])/2 -
                        dt/(2*dx)*(U[SKIP1,INNER]-U[ZM1,INNER]);

                //x momentum
                Ux[ALL, R.Slice(0, -1)] = (U[SKIP1,INNER]+U[ZM1,INNER])/2 -
                dt/(2*dx)*((U[SKIP1,INNER].Pow(2)/H[SKIP1,INNER] +
                            g/2*H[SKIP1,INNER].Pow(2)) -
                           (U[ZM1,INNER].Pow(2)/H[ZM1,INNER] +
                            g/2*H[ZM1,INNER].Pow(2)));

                // y momentum
                Vx[ALL, ZM1] = (V[SKIP1,INNER]+V[ZM1,INNER])/2 -
                            dt/(2*dx)*((U[SKIP1,INNER] *
                            V[SKIP1,INNER]/H[SKIP1, INNER]) -
                           (U[ZM1,INNER] *
                            V[ZM1,INNER]/H[ZM1,INNER]));



                // height
                Hy[ZM1,ALL] = (H[INNER,SKIP1]+H[INNER,ZM1])/2 -
                            dt/(2*dy)*(V[INNER,SKIP1]-V[INNER,ZM1]);

                // x momentum
                Uy[ZM1,ALL] = (U[INNER,SKIP1]+U[INNER,ZM1])/2 -
                            dt/(2*dy)*((V[INNER,SKIP1] *
                            U[INNER,SKIP1]/H[INNER,SKIP1]) -
                           (V[INNER,ZM1] *
                            U[INNER,ZM1]/H[INNER,ZM1]));
                // y momentum
                Vy[ZM1,ALL] = (V[INNER,SKIP1]+V[INNER,ZM1])/2 -
                            dt/(2*dy)*((V[INNER,SKIP1].Pow(2)/H[INNER,SKIP1] +
                            g/2*H[INNER,SKIP1].Pow(2)) -
                           (V[INNER,ZM1].Pow(2)/H[INNER,ZM1] +
                            g/2*H[INNER,ZM1].Pow(2)));

                // Second half step

                // height
                H[INNER, INNER] = H[INNER, INNER] -
                            (dt/dx)*(Ux[SKIP1,ZM1]-Ux[ZM1,ZM1]) -
                            (dt/dy)*(Vy[ZM1,SKIP1]-Vy[ZM1, ZM1]);

                // x momentum
                U[INNER, INNER] = U[INNER, INNER] -
                           (dt/dx)*((Ux[SKIP1,ZM1].Pow(2)/Hx[SKIP1,ZM1] +
                             g/2*Hx[SKIP1,ZM1].Pow(2)) -
                            (Ux[ZM1,ZM1].Pow(2)/Hx[ZM1,ZM1] +
                             g / 2 * Hx[ZM1, ZM1].Pow(2))) -
                             (dt/dy)*((Vy[ZM1,SKIP1] *
                                       Uy[ZM1,SKIP1]/Hy[ZM1,SKIP1]) -
                                        (Vy[ZM1,ZM1] *
                                         Uy[ZM1,ZM1]/Hy[ZM1,ZM1]));
                // y momentum
                V[R.Slice(1, -1),R.Slice(1, -1)] = V[R.Slice(1, -1),R.Slice(1, -1)] -
                               (dt/dx)*((Ux[SKIP1,ZM1] *
                                         Vx[SKIP1,ZM1]/Hx[SKIP1,ZM1]) -
                                        (Ux[ZM1,ZM1]*Vx[ZM1,ZM1]/Hx[ZM1,ZM1])) -
                                        (dt / dy) * ((Vy[ZM1, SKIP1].Pow(2) / Hy[ZM1, SKIP1] +
                                                  g / 2 * Hy[ZM1, SKIP1].Pow(2)) -
                                                 (Vy[ZM1, ZM1].Pow(2) / Hy[ZM1, ZM1] +
                                                  g / 2 * Hy[ZM1, ZM1].Pow(2)));
            }
            //Make sure we have the actual data and use it as a checksum
            return NumCIL.Double.Add.Reduce(NumCIL.Double.Add.Reduce(H / data.N)).Value[0];
        }
        public class DataFloat
        {
            public readonly long N;
            public readonly NumCIL.Float.NdArray H;
            public readonly NumCIL.Float.NdArray U;
            public readonly NumCIL.Float.NdArray V;
            public readonly NumCIL.Float.NdArray Hx;
            public readonly NumCIL.Float.NdArray Ux;
            public readonly NumCIL.Float.NdArray Vx;
            public readonly NumCIL.Float.NdArray Hy;
            public readonly NumCIL.Float.NdArray Uy;
            public readonly NumCIL.Float.NdArray Vy;

            public DataFloat(long n)
            {
                N = n;
                H = NumCIL.Float.Generate.Ones(n + 2, n + 2);
                U = NumCIL.Float.Generate.Ones(n + 2, n + 2);
                V = NumCIL.Float.Generate.Ones(n + 2, n + 2);
                Hx = NumCIL.Float.Generate.Ones(n + 1, n + 1);
                   Ux = NumCIL.Float.Generate.Ones(n + 1, n + 1);
                Vx = NumCIL.Float.Generate.Ones(n + 1, n + 1);
                   Hy = NumCIL.Float.Generate.Ones(n + 1, n + 1);
                Uy = NumCIL.Float.Generate.Ones(n + 1, n + 1);
                Vy = NumCIL.Float.Generate.Ones(n + 1, n + 1);
            }
        }


        public static System.Single SolveFloat(long timesteps, DataFloat data)
        {
            //Convenience indices
            R RN = R.El(data.N); //Same as [n]
            R RN1 = R.El(data.N + 1); //Same as [n+1]

            System.Single g = 9.8f;// gravitational constant
            System.Single dt = 0.02f; // hardwired timestep
            System.Single dx = 1.0f;
            System.Single dy = 1.0f;
            long droploc = data.N / 4;

            var H = data.H;
            var U = data.U;
            var V = data.V;
            var Hx = data.Hx;
            var Ux = data.Ux;
            var Vx = data.Vx;
            var Hy = data.Hy;
            var Uy = data.Uy;
            var Vy = data.Vy;

            //Splash!!!
            H[droploc, droploc] += 5.0f;

            for (int i = 0; i < timesteps; i++)
            {
                H.Flush();
                // Reflecting boundary conditions
                H[ALL, FIRST] = H[ALL, SECOND];
                U[ALL, FIRST] = U[ALL, SECOND];
                V[ALL, FIRST] = -V[ALL, SECOND];
                H[ALL, RN1] = H[ALL, RN];
                U[ALL, RN1] = U[ALL, RN];
                V[ALL, RN1] = -V[ALL, RN];
                H[FIRST, ALL] = H[SECOND, ALL];
                U[FIRST, ALL] = -U[SECOND, ALL];
                V[FIRST, ALL] = V[SECOND, ALL];
                H[RN1, ALL] = H[RN, ALL];
                U[RN1, ALL] = -U[RN, ALL];
                V[RN1, ALL] = V[RN, ALL];

                //First half-step

                //Height
                Hx[ALL, R.Slice(0, -1)] = (H[SKIP1,INNER]+H[ZM1,INNER])/2 -
                        dt/(2*dx)*(U[SKIP1,INNER]-U[ZM1,INNER]);

                //x momentum
                Ux[ALL, R.Slice(0, -1)] = (U[SKIP1,INNER]+U[ZM1,INNER])/2 -
                dt/(2*dx)*((U[SKIP1,INNER].Pow(2)/H[SKIP1,INNER] +
                            g/2*H[SKIP1,INNER].Pow(2)) -
                           (U[ZM1,INNER].Pow(2)/H[ZM1,INNER] +
                            g/2*H[ZM1,INNER].Pow(2)));

                // y momentum
                Vx[ALL, ZM1] = (V[SKIP1,INNER]+V[ZM1,INNER])/2 -
                            dt/(2*dx)*((U[SKIP1,INNER] *
                            V[SKIP1,INNER]/H[SKIP1, INNER]) -
                           (U[ZM1,INNER] *
                            V[ZM1,INNER]/H[ZM1,INNER]));



                // height
                Hy[ZM1,ALL] = (H[INNER,SKIP1]+H[INNER,ZM1])/2 -
                            dt/(2*dy)*(V[INNER,SKIP1]-V[INNER,ZM1]);

                // x momentum
                Uy[ZM1,ALL] = (U[INNER,SKIP1]+U[INNER,ZM1])/2 -
                            dt/(2*dy)*((V[INNER,SKIP1] *
                            U[INNER,SKIP1]/H[INNER,SKIP1]) -
                           (V[INNER,ZM1] *
                            U[INNER,ZM1]/H[INNER,ZM1]));
                // y momentum
                Vy[ZM1,ALL] = (V[INNER,SKIP1]+V[INNER,ZM1])/2 -
                            dt/(2*dy)*((V[INNER,SKIP1].Pow(2)/H[INNER,SKIP1] +
                            g/2*H[INNER,SKIP1].Pow(2)) -
                           (V[INNER,ZM1].Pow(2)/H[INNER,ZM1] +
                            g/2*H[INNER,ZM1].Pow(2)));

                // Second half step

                // height
                H[INNER, INNER] = H[INNER, INNER] -
                            (dt/dx)*(Ux[SKIP1,ZM1]-Ux[ZM1,ZM1]) -
                            (dt/dy)*(Vy[ZM1,SKIP1]-Vy[ZM1, ZM1]);

                // x momentum
                U[INNER, INNER] = U[INNER, INNER] -
                           (dt/dx)*((Ux[SKIP1,ZM1].Pow(2)/Hx[SKIP1,ZM1] +
                             g/2*Hx[SKIP1,ZM1].Pow(2)) -
                            (Ux[ZM1,ZM1].Pow(2)/Hx[ZM1,ZM1] +
                             g / 2 * Hx[ZM1, ZM1].Pow(2))) -
                             (dt/dy)*((Vy[ZM1,SKIP1] *
                                       Uy[ZM1,SKIP1]/Hy[ZM1,SKIP1]) -
                                        (Vy[ZM1,ZM1] *
                                         Uy[ZM1,ZM1]/Hy[ZM1,ZM1]));
                // y momentum
                V[R.Slice(1, -1),R.Slice(1, -1)] = V[R.Slice(1, -1),R.Slice(1, -1)] -
                               (dt/dx)*((Ux[SKIP1,ZM1] *
                                         Vx[SKIP1,ZM1]/Hx[SKIP1,ZM1]) -
                                        (Ux[ZM1,ZM1]*Vx[ZM1,ZM1]/Hx[ZM1,ZM1])) -
                                        (dt / dy) * ((Vy[ZM1, SKIP1].Pow(2) / Hy[ZM1, SKIP1] +
                                                  g / 2 * Hy[ZM1, SKIP1].Pow(2)) -
                                                 (Vy[ZM1, ZM1].Pow(2) / Hy[ZM1, ZM1] +
                                                  g / 2 * Hy[ZM1, ZM1].Pow(2)));
            }
            //Make sure we have the actual data and use it as a checksum
            return NumCIL.Float.Add.Reduce(NumCIL.Float.Add.Reduce(H / data.N)).Value[0];
        }
    }
}