Because a website I need data from doesn't offer an API or RSS feed for its service status, I use a web scraper I built in PHP to grab the data I need and structure it as JSON. However, I want to make sure the script is as resilient as possible.
<?php
/**
 * Service status endpoint.
 *
 * Serves the contents of service.json while that cache is at most five
 * minutes old. Otherwise it downloads the alerts page, extracts the rows of
 * the "Current Conditions" tab (#tab0), rewrites the cache and prints the
 * fresh result as JSON.
 *
 * Any failure to obtain or parse the page ends the request with the error
 * document {"error": ..., "code": 101}.
 */
error_reporting(E_ALL & ~E_NOTICE);
header('Content-Type: application/json');

$cache_file = 'service.json';
// Maximum age of the cache, in seconds (5 minutes).
$cache_ttl = 300;
// Error document emitted whenever the status page cannot be fetched or parsed.
$error_json = '{"error":"Unable to retrieve service status", "code":101}';

// Load the cached JSON file, tolerating a missing, unreadable or corrupt file.
$cache_raw = is_readable($cache_file) ? file_get_contents($cache_file) : false;
$json = ($cache_raw !== false) ? json_decode($cache_raw) : null;

// The cache is only trusted when it decodes to an object carrying a parseable
// meta.cache_time. Its age is measured in absolute seconds: DateInterval::$i
// is just the minutes *component* of a difference, so it cannot be used to
// decide whether more than five minutes have passed.
$cache_is_fresh = false;
if (is_object($json) && isset($json->meta->cache_time) && is_string($json->meta->cache_time)) {
    $cached_at = strtotime($json->meta->cache_time);
    if ($cached_at !== false) {
        $cache_age = time() - $cached_at;
        // A timestamp in the future (negative age) is treated as stale.
        $cache_is_fresh = ($cache_age >= 0 && $cache_age <= $cache_ttl);
    }
}

if (!$cache_is_fresh) {
    // The cache is missing, invalid or too old: request a fresh copy of the page.
    $ch = curl_init();
    if ($ch === false) {
        die($error_json);
    }
    // Setting the curl options
    curl_setopt($ch, CURLOPT_URL, "http://www.translink.ca/en/Schedules-and-Maps/Alerts.aspx");
    // The page tries to set a cookie using JS to see if JS is enabled. Here we just trick it into thinking we have JS.
    curl_setopt($ch, CURLOPT_COOKIE, "YPF8827340282Jdskjhfiw_928937459182JAX666=<my public ip>");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    // Bound the whole transfer as well, so a stalled response cannot hang the request.
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    $res = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on failure; an empty body is just as useless.
    if (!is_string($res) || trim($res) === '') {
        die($error_json);
    }

    // Load everything up into a DOMDocument. Real-world HTML is rarely valid,
    // so parser complaints are collected internally and discarded instead of
    // being printed into the JSON response.
    $previous_libxml_setting = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $loaded = $doc->loadHTML($res);
    libxml_clear_errors();
    libxml_use_internal_errors($previous_libxml_setting);
    if (!$loaded) {
        die($error_json);
    }

    // Set the metadata. "cached" is true here because this is the copy that
    // gets written to disk; it is flipped back before the fresh copy is printed.
    $status = array();
    $status['meta']['cached'] = true;
    $status['meta']['cache_time'] = date("Y-m-d H:i:s");

    // Grab the Current Conditions tab - #tab0
    $tab0 = $doc->getElementById("tab0");
    if ($tab0 === null) {
        die($error_json);
    }

    // Name of the service whose alertInfo row was seen last; the alertDesc row
    // that follows it carries that service's description.
    $lastIndexedService = "";

    foreach ($tab0->getElementsByTagName("tr") as $row) {
        $class = $row->getAttribute('class');

        if ($class === "alertInfo") {
            // Child nodes 1-4 hold, in order: status summary, service name,
            // severity and effective date.
            $cells = $row->childNodes;
            $status_td = $cells->item(1);
            $service_td = $cells->item(2);
            $severity_td = $cells->item(3);
            $effective_td = $cells->item(4);

            // If the markup changed and a cell is missing, skip the row rather
            // than dereferencing null. Forget the previous service too, so the
            // next description is not attached to the wrong entry.
            if ($status_td === null || $service_td === null || $severity_td === null || $effective_td === null) {
                $lastIndexedService = "";
                continue;
            }

            $service = strtolower(str_replace(' ', '', $service_td->nodeValue));

            $summary_node = $status_td->hasChildNodes() ? $status_td->childNodes->item(1) : null;
            $summary = ($summary_node !== null) ? trim($summary_node->nodeValue) : '';

            // The effective cell contains a line break right after the year,
            // which nodeValue drops, gluing the year to the text that follows.
            // Re-insert a space after any four-digit year that is immediately
            // followed by a non-space character.
            $effective = $effective_td->nodeValue;
            $spaced = preg_replace('/\b((?:19|20)\d{2})(?=\S)/', '$1 ', $effective);
            if ($spaced !== null) {
                $effective = $spaced;
            }

            $status['data']['current'][$service]['status']['summary'] = $summary;
            $status['data']['current'][$service]['status']['severity'] = $severity_td->nodeValue;
            $status['data']['current'][$service]['status']['effective'] = $effective;
            $lastIndexedService = $service;
        }

        // Getting the Alert Description
        if ($class === "alertDesc" && $lastIndexedService !== "") {
            $div = $row->getElementsByTagName("div")->item(0);
            if ($div !== null && !empty($div->nodeValue)) {
                // We store it as HTML as there is formatting we'd like to preserve
                $status['data']['current'][$lastIndexedService]['description'] = $doc->saveHTML($div);
            }
        }
    }

    // Store the fresh data into the cache file. Never overwrite a cache with
    // data that could not be encoded, and take an exclusive lock so that
    // concurrent requests do not interleave their writes.
    $encoded = json_encode($status);
    if ($encoded === false) {
        die($error_json);
    }
    file_put_contents($cache_file, $encoded, LOCK_EX);

    // The copy on disk is flagged as cached; the copy returned now is fresh.
    $status['meta']['cached'] = false;
    // Print out the results
    echo(json_encode($status));
} else {
    // Print out the cached results, reusing the bytes that were validated
    // above instead of reading the file a second time.
    echo($cache_raw);
}
?>