I am writing a text classifier, and in order to do so, I need TF/IDF values per every word of my single text.
Then I need to use the cosine similarity:
$$similarity = cos(\theta) = \dfrac{A \cdot B}{\lVert A \lVert \lVert B \lVert} = \dfrac{\overset{n}{\underset{i=1}{\LARGE\Sigma}}A_i \times B_i}{\sqrt{ \overset{n}{\underset{i=1}{\LARGE\Sigma}}(A_i)^2} \times \sqrt{\overset{n}{\underset{i=1}{\LARGE\Sigma}}(B_i)^2}}$$
This requires processing of a big data storage (all of the texts that already exists in my database). The problem is that my code is doing his job for about 2 hours (quite too long) and breaks giving me message that I have run out of memory. I think that the main method might be very, very unoptimised.
public static void CreateCategoryClasses()
{
deserializeClasses = Deserialize();
howManyClasses = deserializeClasses.Count;
ewhClass = new EventWaitHandle[howManyClasses];
for (var i = 0; i < 5; ++i)
{
categoryClasses.Enqueue(new ConcurrentDictionary<string, double>());
result.Enqueue(new ConcurrentDictionary<string, double>());
}
WaitCallback threadMethod = ParseCategories;
ThreadPool.SetMaxThreads(howManyStudents, howManyClasses);
for (var i = 0; i < howManyClasses; ++i)
{
ewhClass[i] = new EventWaitHandle(false, EventResetMode.AutoReset);
ThreadPool.QueueUserWorkItem(threadMethod, i);
}
for (var i = 0; i < howManyClasses; ++i)
{
ewhClass[i].WaitOne();
}
threadMethod = AddIDF;
ThreadPool.SetMaxThreads(howManyStudents, 5);
for (var i = 0; i < 5; ++i)
{
ewhClass[i] = new EventWaitHandle(false, EventResetMode.AutoReset);
ThreadPool.QueueUserWorkItem(threadMethod, i);
}
for (var i = 0; i < 5; ++i)
{
ewhClass[i].WaitOne();
}
List<List<SingleWords>> efekt = new List<List<SingleWords>>(5);
for (int i = 0; i < 5; ++i)
{
foreach (var secondWord in categoryClasses.ToList()[i])
{
result[i].Add(new SingleWords(secondWord.Key, secondWord.Value));
}
}
var xmls = new XmlSerializer(typeof(List<List<SingleWords>>));
using (var sw = new StreamWriter(@"categoryClasses.xml"))
{
xmls.Serialize(sw, efekt);
}
}
private static void AddIDF(object index)
{
Console.WriteLine("Thread started with id:" + index + " and number: " + Thread.CurrentThread.ManagedThreadId);
var i = index as int?;
double sum;
foreach (var word in categoryClasses.ElementAt(i.Value))
{
sum =
deserializeClasses.Count(
clas =>
clas.Bag.Where(x => clas.Category == ((Categories)i.Value).ToString())
.Contains(new Words(word.Key, 0, 0)));
var temp = Convert.ToDouble(sum) /
Convert.ToDouble(
deserializeClasses.Count(x => x.Category == ((Categories)i.Value).ToString()));
result.ElementAt(i.Value).AddOrUpdate(word.Key, temp, (s, d) => temp);
}
ewhClass[(i).Value].Set();
Console.WriteLine("Thread ended with id:" + index + " and number: " + Thread.CurrentThread.ManagedThreadId);
}