Code Review Stack Exchange is a question and answer site for peer programmer code reviews. It's 100% free, no registration required.

Sign up
Here's how it works:
  1. Anybody can ask a question
  2. Anybody can answer
  3. The best answers are voted up and rise to the top

Below is my implementation of the dynamic programming solution to the sequence alignment problem in C++11:

#include <iostream>
#include <string>
#include <vector>

using namespace std;

const size_t alphabets = 26;

/* 
 * Returns the Needleman-Wunsch score for the best alignment of a and b
 * and stores the aligned sequences in a_aligned and b_aligned
 */
int align(const string &a, const string &b, int alpha_gap, 
        int alpha[alphabets][alphabets], string &a_aligned, 
        string &b_aligned);

void print2DVector(const vector<vector<int> > &A);

int min(int a, int b, int c);

int main()
{
    // The input strings that need to be aligned
    string a1 = "AACAGTTACC";
    string b1 = "TAAGGTCA";

    // Penalty for any alphabet matched with a gap
    int gap_penalty = 2;

    /* 
     * alpha[i][j] = penalty for matching the ith alphabet with the
     *               jth alphabet.
     * Here: Penalty for matching an alphabet with anoter one is 1 
     *       Penalty for matching an alphabet with itself is 0
     */
    int alpha[alphabets][alphabets];
    for (size_t i = 0; i < alphabets; ++i)
    {
        for (size_t j = 0; j < alphabets; ++j)
        {
            if (i == j) alpha[i][j] = 0;
            else alpha[i][j] = 1;
        }
    }

    // Aligned sequences
    string a2, b2;
    int penalty = align(a1, b1, gap_penalty, alpha, a2, b2);

    cout << "a: " << a1 << endl;
    cout << "b: " << b1 << endl;
    cout << "Needleman-Wunsch Score: " << penalty << endl;
    cout << "Aligned sequences: " << endl;
    cout << a2 << endl;
    cout << b2 << endl;

    return 0;
}


int align(const string &a, const string &b, int alpha_gap, 
        int alpha[alphabets][alphabets], string &a_aligned, 
        string &b_aligned)
{
    size_t n = a.size();
    size_t m = b.size();

    vector<vector<int> > A(n + 1, vector<int>(m + 1));

    for (size_t i = 0; i <= m; ++i)
        A[0][i] = alpha_gap * i;
    for (size_t i = 0; i <= n; ++i)
        A[i][0] = alpha_gap * i;

    for (size_t i = 1; i <= n; ++i)
    {
        for (size_t j = 1; j <= m; ++j)
        {
            char x_i = a[i-1];
            char y_j = b[j-1];
            A[i][j] = min(A[i-1][j-1] + alpha[x_i - 'A'][y_j - 'A'],
                          A[i-1][j] + alpha_gap,
                          A[i][j-1] + alpha_gap);
        }
    }

    // print2DVector(A);

    a_aligned = "";
    b_aligned = "";
    size_t j = m;
    size_t i = n;
    for (; i >= 1 && j >= 1; --i)
    {
        char x_i = a[i-1];
        char y_j = b[j-1];
        if (A[i][j] == A[i-1][j-1] + alpha[x_i - 'A'][y_j - 'A'])
        {
            /*
             * I think prepending chars this way to a std::string is very inefficient.
             * Is there any better way of doing this without using C-style strings?
             */
            a_aligned = x_i + a_aligned;
            b_aligned = y_j + b_aligned;
            --j;
        }
        else if (A[i][j] == A[i-1][j] + alpha_gap)
        {
            a_aligned = x_i + a_aligned;
            b_aligned = '-' + b_aligned;
        }
        else
        {
            a_aligned = '-' + a_aligned;
            b_aligned = y_j + b_aligned;
            --j;
        }
    }

    while (i >= 1 && j < 1)
    {
        a_aligned = a[i-1] + a_aligned;
        b_aligned = '-' + b_aligned;
        --i;
    }
    while (j >= 1 && i < 1)
    {
        a_aligned = '-' + a_aligned;
        b_aligned = b[j-1] + b_aligned;
        --j;
    }

    return A[n][m];
}


int min(int a, int b, int c)
{
    if (a <= b && a <= c)
        return a;
    else if (b <= a && b <= c)
        return b;
    else
        return c;
}


void print2DVector(const vector<vector<int> > &A)
{
    for (auto i : A)
    {
        for (auto j : i)
            cout << j << " ";
        cout << endl;
    }
}

Here's the output:

a: AACAGTTACC
b: TAAGGTCA
Needleman-Wunsch Score: 7
Aligned sequences: 
AACAGTTACC
TA-AGGT-CA

How can I further improve upon my code in terms of efficiency and elegance?

share|improve this question
1  
Please include the description for what the code does within the question itself. – Simon Forsberg Jul 23 '15 at 15:57
up vote 1 down vote accepted

min

You wrote your own version of min that is the smallest of 3 arguments. But there is already one: std::min. Use it:

A[i][j] = std::min({A[i-1][j-1] + alpha[x_i - 'A'][y_j - 'A'],
                    A[i-1][j] + alpha_gap,
                    A[i][j-1] + alpha_gap});

Also introducing x_i and y_j is very confusing there. Just use a[i-1] and b[j-1].

print

Your print2DVector is doing a lot of unnecessary copies. The outer loop should be:

for (auto& i : A)

Note the reference!

alpha

You pass in an array but there's really no need for such a thing. Instead you should prefer to take some sort of weight class that is callable with two letters and returns the weight:

struct SimpleWeight {
    int operator()(char a, char b) const {
        return a == b ? 0 : 1;
    }
};

template <typename Weight>
int align(..., Weight weights, ...) {
    ...
    A[i][j] = std::min({A[i-1][j-1] + weights(a[i-1], b[j-1]),
                        ...});

    ...
}

signature

Your function is returning 3 things: the total weight, and the two aligned sequences. So it should return 3 things. C++11 has tuples, and tuples are much better than reference out parameters:

template <typename Weight>
std::tuple<int, std::string, std::string> 
align(const string &a, const string &b,
      int alpha_gap, Weight weights);

building the aligned strings

Since you have to go back-to-front to generate the aligned strings, you should just write them in reverse order and then reverse them. That will be much more efficient (O(N) instead of O(N2)). Just reserve the space up front:

std::string a_aligned;
std::string b_aligned;

a_aligned.reserve(a.size() + b.size());
b_aligned.reserve(a.size() + b.size());

// ...

std::reverse(a_aligned.begin(), a_aligned.end());
std::reverse(b_aligned.begin(), b_aligned.end());

return std::make_tuple(A[n][m], a_aligned, b_aligned);

no need to loop

while (i >= 1 && j < 1)
{
    a_aligned = a[i-1] + a_aligned;
    b_aligned = '-' + b_aligned;
    --i;
}

So if j == 0, we're appending i -s to b_aligned and the first i characters of a to a_aligned. That's just:

b_aligned += std::string(i, '-');
a_aligned += a.substr(0, i);
std::reverse(a.end() - i, a.end());
share|improve this answer
    
Thanks a lot. I couldn't understand the 'Weight class' part, could you please elaborate? Also, I later noticed that for (const auto &i : A) would be better for print2DVector. – doge Jul 23 '15 at 17:24

Your Answer

 
discard

By posting your answer, you agree to the privacy policy and terms of service.

Not the answer you're looking for? Browse other questions tagged or ask your own question.