This code (an implementation of the path finding Needleman-Wunsch algorithm), given two sequences, correctly aligns and calculates the similarity of them. For example, given two sequences:
AAA,CCC
or
AA,CA
it correctly aligns them as
---AAA CCC---
and
-AA CA-
respectively, while returning the similarity.
% Matlab implementation of NW algorithm
% Written by Joseph Farah
% Completed 7/14/16
% Last updated 7/14/16
function similarity = needleman(s1, s2)
% defining variables
% sequence related variables
s1 = strcat('-',s1);
s2 = strcat('-',s2);
seq1 = s1;
seq2 = s2;
%seq1 = '-ACTTAGATTACTTG';
%seq2 = '-CCCACTAGATTATTAG';
fseq1 = '';
fseq2 = '';
% weights
w_s = 2;
w_indel = 1;
% temporary costs for the matrix fill
h_cost = 0;
v_cost = 0;
d_cost = 0;
% the matrices
cost_matrix = zeros(length(seq1),length(seq2));
direction_cell_matrix = cell(length(seq1),length(seq2));
%% Initiliazation and Scoring
for x = 1:length(seq1)
for y = 1:length(seq2)
point = [seq1(x), seq2(y)];
if point(1) == '-' && point(2) == '-'
cost_matrix(x,y) = 0;
direction_cell_matrix{x,y} = 'n';
continue
end
% apparently, to get it to go in the right order, moving
% horizontally is defined by y-1 and vertically by x-1
if point(1) == '-' % this means it is on the lip --
cost_matrix(x,y) = cost_matrix(x,y-1) + w_indel;
% the only way it can go is left!
direction_cell_matrix{x,y} = 'h';
continue
end
if point(2) == '-' % is it vertical?
cost_matrix(x,y) = cost_matrix(x-1,y) + w_indel;
% the only way it can go is up!
direction_cell_matrix{x,y} = 'v';
continue
end
% if none of the above conditions were met, the point must be in
% the body of the array. First thing we have to do is calculate the
% cost of the current point.
h_cost = cost_matrix(x,y-1) + w_indel;
v_cost = cost_matrix(x-1,y) + w_indel;
% the diagonal cost will be determined by this next bit--do the
% characters match?
if point(1) == point(2)
% if they match, there is no cost!
d_cost = cost_matrix(x-1,y-1) + 0;
elseif point(1) ~= point(2)
% if they are a mismatch, the cost is the subsitution weight
d_cost = cost_matrix(x-1,y-1) + w_s;
end
% now lets compare them! time to put them in a list...
min_list_cost = [h_cost, v_cost, d_cost];
minimum = min(min_list_cost);
% now we have a cost, and we put it in the matrix!
cost_matrix(x,y) = minimum;
% one last step-- we need to calculate the optimal direction
% to do this, we have to find the minimum of the surround paths.
% Back to the cost variables!
h_cost = cost_matrix(x,y-1);
v_cost = cost_matrix(x-1,y);
d_cost = cost_matrix(x-1,y-1);
% its important to keep track of the order, because the index of
% the minimum value of the list will give us the correct direction
% for the point!
min_list_direction = [h_cost, v_cost, d_cost];
% now we find the minumum AND the index. Without the index, we
% dont know which way to go!
% minimum = min(min_list_direction);
% now that we have a minumum, which direction is it in?
% we went h, v, d
index = find(min_list_cost == minimum);
if index(1) == 1
direction_cell_matrix{x,y} = 'h';
elseif index(1) == 2
direction_cell_matrix{x,y} = 'v';
elseif index(1) == 3
direction_cell_matrix{x,y} = 'd';
end
end
end
%%Traceback
% defining variables for how much we are deviating from the end point
% the final sequence at its worst will be as long as the first sequence
% plus the second sequence. The loop cannot go on for longer than that.
x = length(seq2);
y = length(seq1);
% the coordinate variables above will be updated so that moving through the
% direction matrix will match moving through the sequences
for ii = 1:(length(seq1)+length(seq2))
if direction_cell_matrix{y,x} == 'n'
break
end
if direction_cell_matrix{y,x} == 'h'
fseq2 = strcat(seq2(x),fseq2);
fseq1 = strcat('-',fseq1);
y = y;
x = x - 1;
elseif direction_cell_matrix{y,x} == 'v'
fseq2 = strcat('-',fseq2);
fseq1 = strcat(seq1(y),fseq1);
y = y - 1;
x = x;
elseif direction_cell_matrix{y,x} == 'd'
fseq2 = strcat(seq2(x),fseq2);
fseq1 = strcat(seq1(y),fseq1);
y = y - 1;
x = x - 1;
end
end
sim_score = 0;
for ii=1:length(fseq1)
if fseq1(ii) == fseq2(ii)
sim_score = sim_score + 1;
else
continue
end
end
similarity = sim_score/length(fseq1);
What can I do to improve:
- Efficiency?
- Readability?
- the Matlab-iness of the code? (i.e where can I take advantage of tools Matlab has to offer?)