How would you improve this code? Particularly the check and check_all functions?
The time complexity of the algorithm of mlcs is \$O(|\Sigma|MN)\$, where \$\Sigma\$ is the alphabet, M is the number of strings and N is the length of the strings. Is that right?
My analysis: candidates() performs \$O(|\Sigma|M)\$ operations and is called \$O(N)\$ times.
Based on the reviewed code posted before at Multiple longest common subsequence (another algorithm)
def check(string, seq):
i = 0
j = 0
while i < len(string) and j < len(seq):
if string[i] == seq[j]:
j += 1
i += 1
return len(seq) - j
def checkall(strings, seq):
for x in strings:
a = check(x, seq)
if not a == 0:
print(x, seq, a)
return False
return True
def mlcs(strings):
"""Return a long common subsequence of the strings.
"""
if not strings:
raise ValueError("mlcs() argument is an empty sequence")
strings = list(set(strings)) # deduplicate
alphabet = set.intersection(*(set(s) for s in strings))
# indexes[letter][i] is list of indexes of letter in strings[i].
indexes = {letter:[[] for _ in strings] for letter in alphabet}
for i, s in enumerate(strings):
for j, letter in enumerate(s):
if letter in alphabet:
indexes[letter][i].append(j)
# Generate candidate positions for next step in search.
def candidates():
for letter, letter_indexes in indexes.items():
candidate = []
for ind in letter_indexes:
if len(ind) < 1:
break
q = ind[0]
candidate.append(q)
else:
yield candidate
result = []
while True:
try:
# Choose the closest candidate position, if any.
pos = None
for c in candidates():
if not pos or sum(c) < sum(pos):
pos = c
letter = strings[0][pos[0]]
except TypeError:
return ''.join(result)
for let, letter_indexes in indexes.items():
for k, ind in enumerate(letter_indexes):
ind = [i for i in ind if i > pos[k]]
letter_indexes[k] = ind
result.append(letter)
strings = []
# Alphabet for the strings.
sigma = ["a", "b", "c", "d"]
# Alphabet for the LCS.
sigmax = ["e", "f", "g"]
import random
Nx = 67 # Length of LCS.
N = 128 # Length of strings. N >= Nx.
M = 128 # Number of strings.
x = ""
for _ in range(Nx):
x += random.choice(sigmax)
for _ in range(M):
string = ""
for _ in range(N):
string += random.choice(sigma)
indexes = list(range(N))
random.shuffle(indexes)
indexes = sorted(indexes[:len(x)])
for j in range(len(x)):
string = string[:indexes[j]]+x[j]+string[indexes[j]+1:]
strings += [string]
#strings = ["abbab", "ababa", "abbba"]
#strings = ["abab", "baba", "abaa"]
#strings = ["bacda", "abcde", "decac"]
#strings = ["babbabbb", "bbbaabaa", "abbbabab", "abbababa"]
#strings = ["ab", "aba"]
#print("Strings:")
#print(strings)
l = mlcs(strings)
print("LCS:")
print(l, len(l), checkall(strings, l))