Implementation of Sequence Alignment in C++

Question

Below is my implementation of the dynamic programming solution to the sequence alignment problem in C++11:

#include <iostream>
#include <string>
#include <vector>

using namespace std;

const size_t alphabets = 26;

/* 
 * Returns the Needleman-Wunsch score for the best alignment of a and b
 * and stores the aligned sequences in a_aligned and b_aligned
 */
int align(const string &a, const string &b, int alpha_gap, 
        int alpha[alphabets][alphabets], string &a_aligned, 
        string &b_aligned);

void print2DVector(const vector<vector<int> > &A);

int min(int a, int b, int c);

int main()
{
    // The input strings that need to be aligned
    string a1 = "AACAGTTACC";
    string b1 = "TAAGGTCA";

    // Penalty for any alphabet matched with a gap
    int gap_penalty = 2;

    /* 
     * alpha[i][j] = penalty for matching the ith alphabet with the
     *               jth alphabet.
     * Here: Penalty for matching an alphabet with anoter one is 1 
     *       Penalty for matching an alphabet with itself is 0
     */
    int alpha[alphabets][alphabets];
    for (size_t i = 0; i < alphabets; ++i)
    {
        for (size_t j = 0; j < alphabets; ++j)
        {
            if (i == j) alpha[i][j] = 0;
            else alpha[i][j] = 1;
        }
    }

    // Aligned sequences
    string a2, b2;
    int penalty = align(a1, b1, gap_penalty, alpha, a2, b2);

    cout << "a: " << a1 << endl;
    cout << "b: " << b1 << endl;
    cout << "Needleman-Wunsch Score: " << penalty << endl;
    cout << "Aligned sequences: " << endl;
    cout << a2 << endl;
    cout << b2 << endl;

    return 0;
}


int align(const string &a, const string &b, int alpha_gap, 
        int alpha[alphabets][alphabets], string &a_aligned, 
        string &b_aligned)
{
    size_t n = a.size();
    size_t m = b.size();

    vector<vector<int> > A(n + 1, vector<int>(m + 1));

    for (size_t i = 0; i <= m; ++i)
        A[0][i] = alpha_gap * i;
    for (size_t i = 0; i <= n; ++i)
        A[i][0] = alpha_gap * i;

    for (size_t i = 1; i <= n; ++i)
    {
        for (size_t j = 1; j <= m; ++j)
        {
            char x_i = a[i-1];
            char y_j = b[j-1];
            A[i][j] = min(A[i-1][j-1] + alpha[x_i - 'A'][y_j - 'A'],
                          A[i-1][j] + alpha_gap,
                          A[i][j-1] + alpha_gap);
        }
    }

    // print2DVector(A);

    a_aligned = "";
    b_aligned = "";
    size_t j = m;
    size_t i = n;
    for (; i >= 1 && j >= 1; --i)
    {
        char x_i = a[i-1];
        char y_j = b[j-1];
        if (A[i][j] == A[i-1][j-1] + alpha[x_i - 'A'][y_j - 'A'])
        {
            /*
             * I think prepending chars this way to a std::string is very inefficient.
             * Is there any better way of doing this without using C-style strings?
             */
            a_aligned = x_i + a_aligned;
            b_aligned = y_j + b_aligned;
            --j;
        }
        else if (A[i][j] == A[i-1][j] + alpha_gap)
        {
            a_aligned = x_i + a_aligned;
            b_aligned = '-' + b_aligned;
        }
        else
        {
            a_aligned = '-' + a_aligned;
            b_aligned = y_j + b_aligned;
            --j;
        }
    }

    while (i >= 1 && j < 1)
    {
        a_aligned = a[i-1] + a_aligned;
        b_aligned = '-' + b_aligned;
        --i;
    }
    while (j >= 1 && i < 1)
    {
        a_aligned = '-' + a_aligned;
        b_aligned = b[j-1] + b_aligned;
        --j;
    }

    return A[n][m];
}


int min(int a, int b, int c)
{
    if (a <= b && a <= c)
        return a;
    else if (b <= a && b <= c)
        return b;
    else
        return c;
}


void print2DVector(const vector<vector<int> > &A)
{
    for (auto i : A)
    {
        for (auto j : i)
            cout << j << " ";
        cout << endl;
    }
}

Here's the output:

a: AACAGTTACC
b: TAAGGTCA
Needleman-Wunsch Score: 7
Aligned sequences: 
AACAGTTACC
TA-AGGT-CA

How can I further improve upon my code in terms of efficiency and elegance?

Please include the description for what the code does within the question itself. — Simon Forsberg♦, Jul 23 '15 at 15:57

Barry · Accepted Answer · 2015-07-23 16:26:09Z

min

You wrote your own version of min that is the smallest of 3 arguments. But there is already one: std::min. Use it:

A[i][j] = std::min({A[i-1][j-1] + alpha[x_i - 'A'][y_j - 'A'],
                    A[i-1][j] + alpha_gap,
                    A[i][j-1] + alpha_gap});

Also introducing x_i and y_j is very confusing there. Just use a[i-1] and b[j-1].

print

Your print2DVector is doing a lot of unnecessary copies. The outer loop should be:

for (auto& i : A)

Note the reference!

alpha

You pass in an array but there's really no need for such a thing. Instead you should prefer to take some sort of weight class that is callable with two letters and returns the weight:

struct SimpleWeight {
    int operator()(char a, char b) const {
        return a == b ? 0 : 1;
    }
};

template <typename Weight>
int align(..., Weight weights, ...) {
    ...
    A[i][j] = std::min({A[i-1][j-1] + weights(a[i-1], b[j-1]),
                        ...});

    ...
}

signature

Your function is returning 3 things: the total weight, and the two aligned sequences. So it should return 3 things. C++11 has tuples, and tuples are much better than reference out parameters:

template <typename Weight>
std::tuple<int, std::string, std::string> 
align(const string &a, const string &b,
      int alpha_gap, Weight weights);

building the aligned strings

Since you have to go back-to-front to generate the aligned strings, you should just write them in reverse order and then reverse them. That will be much more efficient (O(N) instead of O(N²)). Just reserve the space up front:

std::string a_aligned;
std::string b_aligned;

a_aligned.reserve(a.size() + b.size());
b_aligned.reserve(a.size() + b.size());

// ...

std::reverse(a_aligned.begin(), a_aligned.end());
std::reverse(b_aligned.begin(), b_aligned.end());

return std::make_tuple(A[n][m], a_aligned, b_aligned);

no need to loop

while (i >= 1 && j < 1)
{
    a_aligned = a[i-1] + a_aligned;
    b_aligned = '-' + b_aligned;
    --i;
}

So if j == 0, we're appending i -s to b_aligned and the first i characters of a to a_aligned. That's just:

b_aligned += std::string(i, '-');
a_aligned += a.substr(0, i);
std::reverse(a.end() - i, a.end());

Thanks a lot. I couldn't understand the 'Weight class' part, could you please elaborate? Also, I later noticed that for (const auto &i : A) would be better for print2DVector. — doge, Jul 23 '15 at 17:24

asked	11 months ago
viewed	559 times
active	11 months ago

current community

your communities

more stack exchange communities

Implementation of Sequence Alignment in C++

1 Answer 1

Your Answer

Not the answer you're looking for? Browse other questions tagged c++ c++11 dynamic-programming or ask your own question.

Hot Network Questions

current community

your communities

more stack exchange communities

Implementation of Sequence Alignment in C++

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Not the answer you're looking for? Browse other questions tagged c++ c++11 dynamic-programming or ask your own question.

Related

Hot Network Questions