Multiplying square matrix with C pthreads (POSIX threads)

Question

I'm a student, and I'm trying to make the product of two square matrix with some threads in the soup. I've already worked on some fast single-threaded product (with some cache-friendly tricks), and I'm interested now on multithreading.

The following code works, but I don't fully understand my outputs in terms of performance.

I'd like to improve my code a bit and of course, improve my implementation (and in a perfect world, achieving a multithreaded version faster than a naive singlethreaded version).

Makefile:

CC=gcc
CCO=# -O3 -march=native
CCF=-std=gnu11 -Wall -Wextra -pedantic -Wno-unused
LDF=-pthread
EXTRA=-DNB_TH=8

all: modular B_bloc K_bloc bench

modular:
    $(CC) -Dmodular $(EXTRA) $(LDF) $(CCF) $(CCO) ./mat_mult.c -o modular

B_bloc:
    $(CC) -DB_bloc $(EXTRA) $(LDF) $(CCF) $(CCO) ./mat_mult.c -o B_bloc

K_bloc:
    $(CC) -DK_bloc $(EXTRA) $(LDF) $(CCF) $(CCO) ./mat_mult.c -o K_bloc

mr_proper:
    rm -f modular B_bloc K_bloc

bench:
    ./modular 128
    ./B_bloc 128
    ./K_bloc 128

    ./modular 208
    ./B_bloc 208
    ./K_bloc 208

    ./modular 400
    ./B_bloc 400
    ./K_bloc 400

    ./modular 1024
    ./B_bloc 1024
    ./K_bloc 1024

mat_mult.c:

#include <stdio.h>
#include <assert.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>

// Pick your favorite flavor
// #define modular
// #define B_bloc
// #define K_bloc

#ifdef modular
    #define FOO_TH modular_
    #define MALLOC_P_TH default_alloc_
#elif defined(B_bloc)
    #define FOO_TH B_bloc_
    #define MALLOC_P_TH default_alloc_
#elif defined(K_bloc)
    #define FOO_TH K_bloc_
    #define MALLOC_P_TH K_bloc_alloc_
    #ifndef BLK
        #define BLK 2
    #endif
#endif

#ifndef BLK
    #define BLK 16
#endif
#ifndef NB_TH
    #define NB_TH 8
#endif

#define THREAD_WAIT 1
#define THREAD_OVER 0

// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /
// TOOLS
// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /

// Tool to build a matrix
void build_mat_(int*** target, int width, int height)
{
    void* origin = malloc(sizeof(int*) * width + sizeof(int) * width * height);
    int** header = (int**) origin;
    int* cursor = (int*) (header + width);
    for(int i = 0; i < width; ++i)
    {
        header[i] = cursor;
        cursor += width;
    }

    *target = (int**) origin;
}

// Tool to allocate a struct-like void* thread parameter (default)
void* default_alloc_()
{
    return malloc(0
            + sizeof(int)   * 3
            + sizeof(int*)  * 1
            + sizeof(int**) * 3
            );
}

// Tool to allocate a struct-like void* thread parameter (K_bloc only)
void* K_bloc_alloc_()
{
    return malloc(0
            + sizeof(int)               * 3
            + sizeof(int*)              * 1
            + sizeof(int**)             * 3
            + sizeof(pthread_mutex_t*)  * 1
            );
}

// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /
// THREADS FOO_TH
// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /

// Split the mult' by picking one-out-of NB_TH column.
void* modular_(void* data)
{
    // Data retrieving
    int* int_p = (int*) data;
    int width = int_p[0];
    int height = int_p[1];
    int id_th = int_p[2];
    int** vec_p = (int**) (int_p + 3);
    int* thread_status = vec_p[0];
    int*** mat_p = (int***) (vec_p + 1);
    int** A = mat_p[0];
    int** B = mat_p[1];
    int** C = mat_p[2];

    // Iterating on columns
    for(int i = 0; i < width; ++i)
        if(i % NB_TH == id_th)
            for(int j = 0; j < height; ++j)
                for(int k = 0; k < width; ++k)
                    C[i][j] += A[i][k] * B[k][j];

#ifdef UNJOIN
    for(;thread_status[-1];sleep(0))
        thread_status[id_th] = THREAD_OVER;
#else
    thread_status[id_th] = THREAD_OVER;
#endif

    pthread_exit(NULL);
}

// Split the mult' in many blocs on j axis. Cache-friendly on B.
void* B_bloc_(void* data)
{
    // Data retrieving
    int* int_p = (int*) data;
    int width = int_p[0];
    int height = int_p[1];
    int id_th = int_p[2];
    int** vec_p = (int**) (int_p + 3);
    int* thread_status = vec_p[0];
    int*** mat_p = (int***) (vec_p + 1);
    int** A = mat_p[0];
    int** B = mat_p[1];
    int** C = mat_p[2];

    // Calculating blocs size
    int blk_width = BLK;
    int cursor = id_th * blk_width;

    // Iterating on blocs
    while(cursor < width)
    {
        for(int i = 0; i < width; ++i)
            for(int j = cursor; j < cursor + blk_width; ++j)
                for(int k = 0; k < width; ++k)
                    C[i][j] += A[i][k] * B[k][j];

        cursor += NB_TH * blk_width;
    }

    // Various stuff and exit.
#ifdef UNJOIN
    for(;thread_status[-1];sleep(0))
        thread_status[id_th] = THREAD_OVER;
#else
    thread_status[id_th] = THREAD_OVER;
#endif

    pthread_exit(NULL);
}

// Split the mult' in blocs on i, j, and k axis. Ensure local data.
void* K_bloc_(void* data)
{
    // Data retrieving
    int* int_p = (int*) data;
    int width = int_p[0];
    int height = int_p[1];
    int id_th = int_p[2];
    int** vec_p = (int**) (int_p + 3);
    int* thread_status = vec_p[0];
    int*** mat_p = (int***) (vec_p + 1);
    int** A = mat_p[0];
    int** B = mat_p[1];
    int** C = mat_p[2];
    pthread_mutex_t** mut_p = (pthread_mutex_t**) (mat_p + 3);
    pthread_mutex_t* mutex = mut_p[0];

    // Calculating bloc size
    int blk_width = width / BLK;
    int blk_i = (id_th % BLK) * blk_width;
    int blk_j = ((id_th / BLK) % BLK) * blk_width;
    int blk_k = ((id_th / (BLK * BLK)) % BLK) * blk_width;
    int blk_id = id_th % (BLK * BLK);

    // Creating local matrix for saving local results
    int** C_local;
    build_mat_(&C_local, blk_width, blk_width);

    // Initializing matrix to 0
    for(int i = 0; i < blk_width; ++i)
        for(int j = 0; j < blk_width; ++j)
            C_local[i][j] = 0;

    // Iterating on the bloc
    for(int i = blk_i, i_l = 0; i < blk_i + blk_width; ++i, ++i_l)
        for(int j = blk_j, j_l = 0; j < blk_j + blk_width; ++j, ++j_l)
            for(int k = blk_k; k < blk_k + blk_width; ++k)
                C_local[i_l][j_l] += A[i][k] * B[k][j];

    // Locking the bloc on targeted matrix to write local results
    pthread_mutex_lock(mutex + blk_id);
    for(int i = blk_i, i_l = 0; i < blk_i + blk_width; ++i, ++i_l)
        for(int j = blk_j, j_l = 0; j < blk_j + blk_width; ++j, ++j_l)
            C[i][j] += C_local[i_l][j_l];
    pthread_mutex_unlock(mutex + blk_id);

    // Free memory
    free(C_local);

    // Various stuff and exit.
#ifdef UNJOIN
    for(;thread_status[-1];sleep(0))
        thread_status[id_th] = THREAD_OVER;
#else
    thread_status[id_th] = THREAD_OVER;
#endif

    pthread_exit(NULL);
}

// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /
// MAIN
// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /

int main(int argc, char** argv)
{
#ifdef TM
    int width = TM;
    int height = width;
#else
    int width = atoi(argv[1]);
    int height = width;
#endif

// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /
// CREATING MATRIX
// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /

    // Contiguous columns to help the cache
    int **A, **B, **C, **D;
    build_mat_(&A, width, height);
    build_mat_(&B, width, height);
    build_mat_(&C, width, height);
    build_mat_(&D, width, height);

    // Initialize them to various values
    for(int i = 0; i < width; ++i)
        for(int j = 0; j < height; ++j)
        {
            A[i][j] = (i * i) % (j + 1) + i;
            B[i][j] = (i * j) % (i + 1) + j;
            C[i][j] = 0;
            D[i][j] = 0;
        }

// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /
// CALCULATE REFERENCE RESULT (NAIVE)
// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /

    // Naive calculus of the resulting matrix
    clock_t naive_start = clock();
    for(int i = 0; i < width; ++i)
        for(int j = 0; j < height; ++j)
            for(int k = 0; k < width; ++k)
                D[i][j] += A[i][k] * B[k][j];
    clock_t naive_end = clock();

// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /
// PREPARING THREADS
// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /

    pthread_t thread_id[NB_TH];
    void* thread_p[NB_TH];

    // Thread status vector, mostly used with -DUNJOIN
    int* thread_status = (int*) malloc(sizeof(int) * (NB_TH + 1));
    for(int i = 0; i < (NB_TH + 1); ++i)
        thread_status[i] = THREAD_WAIT;
    ++thread_status;

    // Checking asserts and execute method-related initializations.
    assert(width == height);
#ifdef B_bloc
    assert(width % BLK == 0);
#elif defined(K_bloc)
    assert(NB_TH == BLK * BLK * BLK);
    assert(width % BLK == 0);

    pthread_mutex_t* K_bloc_mutex =
        malloc(sizeof(pthread_mutex_t) * (NB_TH / BLK));
    for(int i = 0; i < NB_TH / BLK; ++i)
        pthread_mutex_init(K_bloc_mutex + i, NULL);
#endif

    // Initialize thread's parameters.
    for(int i = 0; i < NB_TH; ++i)
    {
        thread_p[i] = MALLOC_P_TH();
        int* int_p = (int*) thread_p[i];
        int_p[0] = width;
        int_p[1] = height;
        int_p[2] = i;
        int** vec_p = (int**) (int_p + 3);
        vec_p[0] = thread_status;
        int*** mat_p = (int***) (vec_p + 1);
        mat_p[0] = A;
        mat_p[1] = B;
        mat_p[2] = C;
#ifdef K_bloc
        pthread_mutex_t** mut_p = (pthread_mutex_t**) (mat_p + 3);
        mut_p[0] = K_bloc_mutex;
#endif
    }

// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /
// STARTING THREADS
// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /

    // Start the thread's timer
    clock_t thread_start = clock();
    for(int i = 0; i < NB_TH; ++i)
        pthread_create(&thread_id[i], NULL, FOO_TH, thread_p[i]);

// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /
// WAITING THREADS
// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /
#ifndef UNJOIN
    for(int i = 0; i < NB_TH; ++i)
        pthread_join(thread_id[i], NULL);
#else
    for(int i, status = THREAD_WAIT; status == THREAD_WAIT;)
        for(i = 0, status = THREAD_OVER; i < NB_TH; ++i, sleep(0))
            status = status || thread_status[i];
#endif

    // End the thread's timer
    clock_t thread_end = clock();

    for(int i = 0; i < NB_TH; ++i)
        assert(thread_status[i] == THREAD_OVER);
    thread_status[-1] = THREAD_OVER;

// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /
// CHECKING THE RESULT
// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /

    for(int i = 0; i < width; ++i)
        for(int j = 0; j < height; ++j)
            if(C[i][j] != D[i][j])
                return 1;

// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /
// FREEING MEMORY
// / // / // / // / // / // / // / // / // / // / // / // / // / // / // / // /

#ifdef K_bloc
    free(K_bloc_mutex);
#endif

    free(--thread_status);
    for(int i = 0; i < NB_TH; ++i)
        free(thread_p[i]);

    free(A);
    free(B);
    free(C);
    free(D);

    float naive_time = ((float) naive_end - naive_start) / CLOCKS_PER_SEC;
    float thread_time = ((float) thread_end - thread_start) / CLOCKS_PER_SEC;

    printf("%f\n", naive_time);
    printf("%f\n", thread_time);
    printf("%f\n", thread_time / naive_time);

    return 0;
}

Sample output:

gcc -Dmodular -DNB_TH=8 -pthread -std=gnu11 -Wall -Wextra -pedantic -Wno-unused  ./mat_mult.c -o modular
gcc -DB_bloc -DNB_TH=8 -pthread -std=gnu11 -Wall -Wextra -pedantic -Wno-unused  ./mat_mult.c -o B_bloc
gcc -DK_bloc -DNB_TH=8 -pthread -std=gnu11 -Wall -Wextra -pedantic -Wno-unused  ./mat_mult.c -o K_bloc
./modular 128
0.009941
0.009169
0.922342
./B_bloc 128
0.010069
0.008471
0.841295
./K_bloc 128
0.009771
0.010346
1.058848
./modular 208
0.038660
0.082287
2.128479
./B_bloc 208
0.038176
0.048373
1.267105
./K_bloc 208
0.037938
0.051802
1.365438
./modular 400
0.329782
0.502367
1.523331
./B_bloc 400
0.327145
0.501808
1.533901
./K_bloc 400
0.324983
0.638929
1.966038
./modular 1024
9.322891
16.263161
1.744433
./B_bloc 1024
9.136414
16.874943
1.846999
./K_bloc 1024
8.651233
15.378231
1.777577

Please do not update the code in your question to incorporate feedback from answers, doing so goes against the Question + Answer style of Code Review. This is not a forum where you should keep the most updated version in your question. Please see what you may and may not do after receiving answers. — Ethan Bierlein, Mar 1 at 19:02

JS1 · Accepted Answer · 2016-03-01 09:04:42Z

Wrong loop order

In your naive matrix multiplication, you have your loops in a suboptimal ordering. I compared your implementation with this one, where I swapped the second and third loops:

for(int i = 0; i < width; ++i)
    for(int k = 0; k < width; ++k)
        for(int j = 0; j < height; ++j)
            D[i][j] += A[i][k] * B[k][j];

When I ran B_bloc 1024, I got these numbers (no threads involved):

Original loops: 9.89 sec
Swapped loops : 1.07 sec

So clearly you need to think about cache usage a little bit more. The version I listed here is more cache friendly because A[i][k] is constant in the inner loop while the other two matrices are being iterated in memory order. With the original loops where the inner loop iterated on k, the B[k][j] term was jumping cache lines on every loop iteration.

Use a struct!

The code you use to pass information between the main thread and the pthreads is not good at all. You are reinventing the wheel by constructing your own handcoded struct:

    // This is what MALLOC_P_TH() calls:
    void* default_alloc_()
    {
        return malloc(0
                + sizeof(int)   * 3
                + sizeof(int*)  * 1
                + sizeof(int**) * 3
                );
    }

    // In main():
    thread_p[i] = MALLOC_P_TH();
    int* int_p = (int*) thread_p[i];
    int_p[0] = width;
    int_p[1] = height;
    int_p[2] = i;
    int** vec_p = (int**) (int_p + 3);
    vec_p[0] = thread_status;
    int*** mat_p = (int***) (vec_p + 1);
    mat_p[0] = A;
    mat_p[1] = B;
    mat_p[2] = C;

You should be using an actual struct, like this:

typedef struct ThreadArgs {
    int width;
    int height;
    int threadNum;
    int *pStatus;
    int **matrixA;
    int **matrixB;
    int **matrixC;
} ThreadArgs;

// In main():
thread_p[i]            = malloc(sizeof(ThreadArgs));
thread_p[i]->width     = width;
thread_p[i]->height    = height;
thread_p[i]->threadNum = i;
thread_p[i]->pStatus   = thread_status;
thread_p[i]->matrixA   = A;
thread_p[i]->matrixB   = B;
thread_p[i]->matrixC   = C;

As you can see, this improves the code in many ways:

You don't have to have a custom function just to compute the size of the struct.
Your fields are actually named, so you can refer to each field by name rather than number.
If you need to add or remove fields, you simply make modifications to the struct rather than fiddling with the handcoded arrays.

Using the wrong clock function

When I ran your program, I was surprised to see that none of your multithreaded versions ran faster than the single threaded version. However, I discovered that you were using the wrong function to time your program. The clock() function measures cpu cycles used across all threads. So even if your program ran 8 times faster, you would still get the same reading from clock() as for a single threaded program. What you want is a wall clock measurement. I would suggest using something like clock_gettime() or gettimeofday(). After I modified your program to use clock_gettime() and fixed the loop orders, I ran B_bloc and got the following output, which makes a lot more sense (I have a 4 core system, so 0.25 on the third line is expected):

$ B_bloc 1024
1.102912
0.276752
0.250928

Wow ! I was using the wrong clock function… I found further reading on this mistake here : Using clock to measure time in multithreaded programs. — Dallas, Mar 1 at 11:32

asked	5 months ago
viewed	71 times
active	5 months ago

current community

your communities

more stack exchange communities

Multiplying square matrix with C pthreads (POSIX threads)

1 Answer 1

Wrong loop order

Use a struct!

Using the wrong clock function

Your Answer

Not the answer you're looking for? Browse other questions tagged performance c multithreading matrix pthreads or ask your own question.

Hot Network Questions

current community

your communities

more stack exchange communities

Multiplying square matrix with C pthreads (POSIX threads)

1 Answer 1

Wrong loop order

Use a struct!

Using the wrong clock function

Your Answer

Sign up or log in

Post as a guest

Not the answer you're looking for? Browse other questions tagged performance c multithreading matrix pthreads or ask your own question.

Related

Hot Network Questions