I have code that parses through web pages finds commentaries and saves commentary info in DB. I have an array where all necessary pages are stored. I iterate through all these pages one by one and find HMTL element which contains all the info about commentary if there are multiple pages with commentaries I iterate through all of them. I also check each word spelling with dictionary and calculate correctness of the whole commentary. Then I save all the info + some additional info into my DB. I am using Simple HTML DOM parser.
The code is messy and quite slow and I would like to speed it up by optimizing it.
<?php
header('Content-Type: text/html; charset=Windows-1252');
header('Content-type: text/html; charset=utf-8');
include('connect.php');
set_time_limit(0);
include('simple_html_dom.php');
//array with links to pages
$array = array();
foreach ($array as $value){
$nextLink = $value;
// Parse each link
while ($nextLink) {
//finds class with commentary info
$html=file_get_html($nextLink);
$articles = $html->find('div [class=article-comment]');
// iterate throught each commentary class
foreach($articles as $article) {
// finds commentary text
$content = $article->find('div[class=article-comment-content]',0)->innertext;
// Dictionary
$broker = enchant_broker_init();
$tag = 'lv_LV';
enchant_broker_set_dict_path($broker, ENCHANT_MYSPELL, 'C:\wamp\bin\php\php5.5.12');
//check spellnig of each word
if (enchant_broker_dict_exists($broker, $tag)) {
$dict = enchant_broker_request_dict($broker, $tag);
//skip elements for check
$trimmed_str = str_replace(['.', ',', '?', '!', ':', '"',')', '(', '*', '-', '–','','+','-','%',';',"'",'=','0','1','2','3','4','5','6','7','8','9','/'], ' ', $content);
$word_count = str_word_count($trimmed_str, 0, 'ēūīāšģķļžčņĒŪĪĀŠĢĶĻŽČŅ');
$word_array = preg_split('/\s+/', $trimmed_str);
//corect words count
$correct_words = 0;
//array for incorrect words
$incorrect_words = array();
//check words, if incorrect -> add to the array else add to count 1
foreach ($word_array as $word_value ){
$isCorrectlySpelled = enchant_dict_check($dict, $word_value);
if ($isCorrectlySpelled !== true) {
array_push($incorrect_words, $word_value);
}
else {
$correct_words +=1;
}
}
//incorrect words are stored in the array and separated with comma
$comma_separated = implode(",", $incorrect_words);
//calclate corectness of each word
$correctness = round(($correct_words/$word_count)*100,2);
}
enchant_broker_free($broker);
// author of commentary
$author = $article->find('strong[class=article-comment-author]',0)->innertext;
$date = $article->find('time[datetime]',0)->outertext;
$str = $html->getElementById("someId")->outertext;
$var = preg_split('/datetime=\"/', $date);
$var1 = preg_split('/\"/',$var[1]);
//likes and dislikes
$voteUP = $article->find('a[class=article-comment-vote plus]',0)->innertext;
$voteDown = $article->find('a[class=article-comment-vote minus]',0)->innertext;
//save info in DB
$sql = strip_tags("INSERT INTO commentstable(comment, author, date, VoteUP, VoteDown, word_count, correct_words, correctness, incorrect, articleID) VALUES ('$content', '$author', '$var1[0]', '$voteUP', '$voteDown','$word_count','$correct_words', '$correctness', '$comma_separated', '')");
if($sql) {
if(mysqli_query($connection, $sql)){
echo "Records added successfully.";
}
else{
echo "ERROR: Could not able to execute $insert. " . mysqli_error($link);
}
}
}
//If there are no more pages with comments
$next_stop = $html->find('a[class=pageing-button-next disable]',0);
if($next_stop){
break;
}
// open next page
$nextLink = ( ($temp = $html->find("a[class=pageing-button-next]", 0)) ? "http://somepage.com".$temp->href : NULL );
// Clear DOM object
$html->clear();
unset($html);
}
}