I have this task to find the longest identical phrase which appears in 2 given files. With small files (~500KB each) it works just fine, ~10 secs to finish. But with larger files (~5MB each) it took ~43 mins to complete. The text in larger files are just copied and pasted over and over again. And I want to improve that if possible.
public static void LongestPhrase(string Book1, string Book2, ref string Phrase, ref int WIndex1, ref int WIndex2)
{
string[] Words1 = Book1.Split('-');
string[] Words2 = Book2.Split('-');
string Fragment = string.Empty;
int Counter1 = 0, Counter2 = 0;
Dictionary<string, List<int>> Index1 = new Dictionary<string, List<int>>();
Dictionary<string, List<int>> Index2 = new Dictionary<string, List<int>>();
List<string> Repeated = new List<string>();
List<string> Word1 = new List<string>();
List<string> Word2 = new List<string>();
foreach(string Nfo in Words1)
{
if (!Word1.Contains(Nfo.ToLower()))
Word1.Add(Nfo.ToLower());
}
foreach (string Nfo in Words2)
{
if (!Word2.Contains(Nfo.ToLower()))
Word2.Add(Nfo.ToLower());
}
foreach(string Nfo in Word1)
{
if (Word2.Contains(Nfo.ToLower()))
Repeated.Add(Nfo.ToLower());
}
for (int i = 0; i < Words1.Length; i++)
{
if (Repeated.Contains(Words1[i].ToLower()))
{
if (!Index1.ContainsKey(Words1[i].ToLower()))
{
Index1.Add(Words1[i].ToLower(), new List<int>());
Index1[Words1[i].ToLower()].Add(i);
}
else
Index1[Words1[i].ToLower()].Add(i);
}
}
for (int x = 0; x < Words2.Length; x++)
{
if (Repeated.Contains(Words2[x].ToLower()))
{
if (!Index2.ContainsKey(Words2[x].ToLower()))
{
Index2.Add(Words2[x].ToLower(), new List<int>());
Index2[Words2[x].ToLower()].Add(x);
}
else
Index2[Words2[x].ToLower()].Add(x);
}
}
foreach (string Nfo in Repeated.OrderByDescending(a => a.Length))
{
foreach (int Inf in Index1[Nfo])
{
Counter1 = Inf;
foreach (int Infs in Index2[Nfo])
{
Counter2 = Infs;
int Start1 = 0, Start2 = 0;
Fragment = Words1[Counter1];
for (Start1 = Counter1 + 1, Start2 = Counter2 + 1; Start1 < Words1.Length && Start2 < Words2.Length; Start1++, Start2++)
{
if (Words1[Start1].ToLower() == Words2[Start2].ToLower())
Fragment += " " + Words1[Start1];
else
break;
}
if (Fragment.Length > Phrase.Length)
{
Phrase = Fragment;
WIndex1 = Counter1;
WIndex2 = Counter2;
}
}
}
}
}
static void Main(string[] args)
{
var watch = Stopwatch.StartNew();
Console.WriteLine("A");
string Book1 = File.ReadAllText("Book1.txt", Encoding.GetEncoding(1257));
string Book2 = File.ReadAllText("Book2.txt", Encoding.GetEncoding(1257));
string UnfilteredBook1 = Regex.Replace(Book1, @"\s+", "-", RegexOptions.Singleline);
string UnfilteredBook2 = Regex.Replace(Book2, @"\s+", "-", RegexOptions.Singleline);
string Phrase = string.Empty;
int WordIndex1 = 0, WordIndex2 = 0;
LongestPhrase(UnfilteredBook1, UnfilteredBook2, ref Phrase, ref WordIndex1, ref WordIndex2);
watch.Stop();
var elapsedMs = watch.ElapsedMilliseconds;
Console.WriteLine("-> {0}", elapsedMs);
}
What I do in my function is this:
- Make 2 lists from both books containing all different words
- Make a list of words which are in both books
- Find all positions of repeated words
- Go through all repeated words through all found positions and search for phrase
I think 1-3 steps are all right and that 4th could be improved. I just don't have any ideas how. Should I change the approach to find longest phrase?
Dictionary<string, List<int>>
is a bomb of inefficiency. Dictionaries are very nice for small sizes; but when dealing with relevant amounts of data they are not good. Needless to say that theList<int>
part makes this version particularly aggressive. An almost-working-always way to notably improve the performance of a code dealing with relevant amounts of data is replacing all the dictionaries with more efficient collections (arrays); ideally, all the redundant (global) intermediate storage should be removed by properly analysing the algorithm. – varocarbas 4 hours ago