Mxmul

Python Numpy

from __future__ import print_function
from benchpress import util
import numpy as np

def main():

    B = util.Benchmark()
    try:
        N,I = B.size                            # Grab command-line arguments
    except ValueError:
        N,I = (B.size[0], 1)
    nelements = B.dtype(N*N)

    # Load or create matrices
    if B.inputfn:
        arrays = B.load_arrays()
        x = arrays['x']
        y = arrays['y']
    else:
        x = np.arange(nelements, dtype=B.dtype)/nelements
        x.shape = (N, N)

        y = np.arange(N**2, dtype=B.dtype)/nelements
        y.shape = (N, N)

    if B.dumpinput:
        B.dump_arrays("mxmul", {'x': x, 'y': y})

    C = np.zeros_like(x)

    # Do the matrix multiplication
    B.start()
    for _ in range(I):
        C += np.add.reduce(x[:,np.newaxis] * np.transpose(y), -1)
        B.flush()
    #R = np.dot(x, y)
    B.stop()

    # Print / dump
    B.pprint()
    if B.outputfn:
        B.tofile(B.outputfn, {'res': C})
    if B.verbose:
        print(np.sum(C))

if __name__ == "__main__":
    main()

C99 Omp

#include <stdlib.h>
#include <stdio.h>
#include <bp_util.h>

void matmul(double* a, double* b, double* c, int m, int n, int k)
{
    #pragma omp parallel for collapse(2)
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            double sum = 0;
            for (int t = 0; t < k; t++) {
                sum += a[i*m +t] * b[t*n + j];
            }
            c[i*n+j] = sum;
        }
    }
}

int main(int argc, char* argv[])
{
    bp_util_type bp = bp_util_create(argc, argv, 1);        // Grab arguments
    if (bp.args.has_error) {
        return 1;
    }
    bp.timer_start();

    const int n = bp.args.sizes[0];
    const int nelements = n*n;
    
    double* a = (double*)malloc(sizeof(double)*nelements);  // Data setup
    double* b = (double*)malloc(sizeof(double)*nelements);
    double* c = (double*)malloc(sizeof(double)*nelements);
    for (int i=0; i<nelements; ++i) {
        a[i] = i / (float)nelements;
        b[i] = i / (float)nelements;
    }

    bp.timer_start();                                       // Start timer
    matmul(a, b, c, n, n, n);
    bp.timer_stop();

    bp.print("mxmul(c99_omp)");
    if (bp.args.verbose) {                                  // and values.
        double r = 0.0;
        #pragma omp parallel for reduction(+:r)
        for(int i=0; i<nelements; ++i) {
            r += c[i];
        }
        printf("Result = %.10f\n", r);
    }
    free(a);
    free(b);
    free(c);

    return 0;
}

C99 Seq

#include <stdlib.h>
#include <stdio.h>
#include <bp_util.h>

void matmul(double* a, double* b, double* c, int m, int n, int k)
{
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            double sum = 0;
            for (int t = 0; t < k; t++) {
                sum += a[i*m +t] * b[t*n + j];
            }
            c[i*n+j] = sum;
        }
    }
}

int main(int argc, char* argv[])
{
    bp_util_type bp = bp_util_create(argc, argv, 1);        // Grab arguments
    if (bp.args.has_error) {
        return 1;
    }
    bp.timer_start();

    const int n = bp.args.sizes[0];
    const int nelements = n*n;
    
    double* a = (double*)malloc(sizeof(double)*nelements);  // Data setup
    double* b = (double*)malloc(sizeof(double)*nelements);
    double* c = (double*)malloc(sizeof(double)*nelements);
    for (int i=0; i<nelements; ++i) {
        a[i] = i / (float)nelements;
        b[i] = i / (float)nelements;
    }

    bp.timer_start();                                       // Start timer
    matmul(a, b, c, n, n, n);
    bp.timer_stop();

    bp.print("mxmul(c99_seq)");
    if (bp.args.verbose) {                                  // and values.
        double r = 0.0;
        for(int i=0; i<nelements; ++i) {
            r += c[i];
        }
        printf("Result = %.10f\n", r);
    }
    free(a);
    free(b);
    free(c);

    return 0;
}

Cpp11 Bxx

#include <iostream>
#include <iomanip>
#include <bxx/bohrium.hpp>
#include <bp_util.h>

using namespace std;
using namespace bxx;

int main(int argc, char* argv[])
{
    bp_util_type bp = bp_util_create(argc, argv, 1);// Grab arguments
    if (bp.args.has_error) {
        return 1;
    }
    const int n = bp.args.sizes[0];
    const double nelements = n*n;

    multi_array<double> a, b, c;                    // Construct matrices
    a = view_as<double>(range<double>(n*n) / nelements, n, n);
    b = view_as<double>(range<double>(n*n) / nelements, n, n);

    Runtime::instance().flush();
    bp.timer_start();                               // Start timer
    c = matmul(a, b);                               // Run benchmark
    Runtime::instance().flush();
    bp.timer_stop();                                // Stop timer

    bp.print("mxmul(cpp11_bxx)");				    // Print results..
    if (bp.args.verbose) {                          // ..and value.
        cout << fixed << setprecision(10)
			 << "Result = " << scalar<double>(sum(c)) << endl;
    }

    return 0;
}

Cpp11 Omp

#include <iostream>
#include <iomanip>
#include <bp_util.h>

#define N 2000

using namespace std;

template <typename T>
void matmul(T a, T b, T c, int m, int n, int k)
{
    #pragma omp parallel for collapse(2)
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            double sum = 0;
            for (int t = 0; t < k; t++) {
                sum += a[i][t] * b[t][j];
            }
            c[i][j] = sum;
        }
    }
}

int main(int argc, char* argv[])
{
    bp_util_type bp = bp_util_create(argc, argv, 1);    // Grab arguments
    if (bp.args.has_error) {
        return 1;
    }
    bp.timer_start();

    const int n = bp.args.sizes[0];
    const int nelements = n*n;

    if (n != N) {
        cout << "HEJ " << n << " " << N << endl;
    }
    
    auto a = new double[N][N];  // Data setup
    auto b = new double[N][N];
    auto c = new double[N][N];
    int range = 0;
    for (int i=0; i<n; ++i) {
        for (int j=0; j<n; ++j) {
            a[i][j] = (range++) / (float)nelements;
            b[i][j] = (range) / (float)nelements;
        }
    }

    bp.timer_start();                                       // Start timer
    matmul(a, b, c, n, n, n);
    bp.timer_stop();

    bp.print("mxmul(cpp11_omp)");
    if (bp.args.verbose) {                                  // and values.
        double r = 0.0;
        #pragma omp parallel for collapse(2) reduction(+:r)
        for(int i=0; i<n; ++i) {
            for(int j=0; j<n; ++j) {
                r += c[i][j];
            }
        }
        cout << fixed << setprecision(10)
			 << "Result = " << r << endl;
    }

    delete a; 
    delete b; 
    delete c; 

    return 0;
}