I've designed an algorithm to find the longest common subsequence. This is how it works:
Initially i = 0
- Picks the first letter from the first string starting from the ith letter.
- Goes to the second string looking for that picked letter.
- If not found returns to the first string and picks the next letter and repeats 1 to 3 until it finds a match in the second string.
- Now that found a common letter in the second string, adds it to
common_subsequence
. - Stores its position in
index
. - Picks next letter from the first string and do step 2 but this time starts from
index
. - Repeats 3 to 6 until it reaches to the end of string 1 or string 2.
- If the length of
common_subsequence
is greater than the length of common subsequence that found so far, assignscommon_subsequence
tolcs
. - Increment the value of i and repeats 1 to 9 until i is equal to the length of the first string.
Here is an example:
X=A, B, C, B, D, A, B
Y=B, D, C, A, B, A
- First picks
A
. - Looks for
A
inY
. - Now that found
A
adds it to the end ofcommon_subsequence
. - Picks
B
fromX
. - Looks for
B
inY
but this time starts searching fromA
. - Picks
C
. It dosen't exist in string 2, so picks the next letter inX
that isB
.
...
...
...
The complexity of this algorithm is theta(n*m).
I implemented it on two methods. The second one uses a hash table, but after implementing I found that it's much slower compared to the first algorithm. I can't understand why.
Here is my implementation:
First algorithm:
import time
def lcs(xstr, ystr):
if not (xstr and ystr): return # if string is empty
lcs = [''] # longest common subsequence
lcslen = 0 # length of longest common subsequence so far
for i in xrange(len(xstr)):
cs = '' # common subsequence
start = 0 # start position in ystr
for item in xstr[i:]:
index = ystr.find(item, start) # position at the common letter
if index != -1: # if common letter is found
cs += item # add common letter to the cs
start = index + 1
if index == len(ystr) - 1: break # if reached to the end of ystr
# updates lcs and lcslen if found better cs
if len(cs) > lcslen: lcs, lcslen = [cs], len(cs)
elif len(cs) == lcslen: lcs.append(cs)
return lcs
file1 = open('/home/saji/file1')
file2 = open('/home/saji/file2')
xstr = file1.read()
ystr = file2.read()
start = time.time()
lcss = lcs(xstr, ystr)
elapsed = (time.time() - start)
print elapsed
Second one using hash table:
import time
from collections import defaultdict
def lcs(xstr, ystr):
if not (xstr and ystr): return # if strings are empty
lcs = [''] # longest common subsequence
lcslen = 0 # length of longest common subsequence so far
location = defaultdict(list) # keeps track of items in the ystr
i = 0
for k in ystr:
location[k].append(i)
i += 1
for i in xrange(len(xstr)):
cs = '' # common subsequence
index = -1
reached_index = defaultdict(int)
for item in xstr[i:]:
for new_index in location[item][reached_index[item]:]:
reached_index[item] += 1
if index < new_index:
cs += item # add item to the cs
index = new_index
break
if index == len(ystr) - 1: break # if reached to the end of ystr
# update lcs and lcslen if found better cs
if len(cs) > lcslen: lcs, lcslen = [cs], len(cs)
elif len(cs) == lcslen: lcs.append(cs)
return lcs
file1 = open('/home/saji/file1')
file2 = open('/home/saji/file2')
xstr = file1.read()
ystr = file2.read()
start = time.time()
lcss = lcs(xstr, ystr)
elapsed = (time.time() - start)
print elapsed