Sign up ×
Code Review Stack Exchange is a question and answer site for peer programmer code reviews. It's 100% free, no registration required.

Because a website I need data from doesn't have an API or RSS feed for its service status, I use a web scraper I built in PHP to grab the data I need and structure it as JSON. However, I want to make sure the script is as resilient as possible.

<?php

/*
 * Service status scraper.
 *
 * Emits the current service alerts as JSON. A copy of the last successful
 * scrape is kept in service.json and served as long as it is no older than
 * five minutes; otherwise the alerts page is fetched again, parsed, written
 * back to the cache file and printed.
 *
 * On any failure to fetch or parse the page the script prints the JSON error
 * object {"error": ..., "code": 101} and stops.
 */

error_reporting(E_ALL & ~E_NOTICE);
header('Content-Type: application/json');

$cacheFile   = 'service.json';
$maxCacheAge = 300; // seconds (5 minutes)
$errorBody   = '{"error":"Unable to retrieve service status", "code":101}';

// Load the cached JSON file, tolerating a missing or unreadable file.
$cachedRaw = is_readable($cacheFile) ? file_get_contents($cacheFile) : false;
$cached    = ($cachedRaw !== false) ? json_decode($cachedRaw) : null;

// Work out the age of the cache in whole seconds. Comparing timestamps avoids
// the DateInterval pitfall where ->i is only the minutes *component* (0-59),
// which makes a cache that is 1h02m (or several days) old look fresh.
$cachedAt = false;
if (isset($cached->meta->cache_time) && is_string($cached->meta->cache_time)) {
    $cachedAt = strtotime($cached->meta->cache_time);
}

// A missing, corrupt or undated cache is never considered fresh.
$cacheIsFresh = ($cachedAt !== false) && (abs(time() - $cachedAt) <= $maxCacheAge);

if (!$cacheIsFresh) {

    // The cache is missing or stale: make a request to grab a fresh copy.
    $ch = curl_init();
    if ($ch === false) {
        die($errorBody);
    }

    // Setting the curl options
    curl_setopt($ch, CURLOPT_URL, "http://www.translink.ca/en/Schedules-and-Maps/Alerts.aspx");

    // The page tries to set a cookie using JS to see if JS is enabled.  Here I just trick it into thinking we have JS.
    curl_setopt($ch, CURLOPT_COOKIE, "YPF8827340282Jdskjhfiw_928937459182JAX666=<my public ip>");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    // Also bound the whole transfer so a stalled response cannot hang the script.
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    $res = curl_exec($ch);

    curl_close($ch);

    // curl_exec() returns false on failure; an empty body is equally useless.
    if (!is_string($res) || $res === '') {
        die($errorBody);
    }

    // Load everything up into a DOMDocument. Real-world HTML is rarely valid,
    // so collect libxml's complaints internally instead of silencing with @.
    $previousLibxmlSetting = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $loaded = $doc->loadHTML($res);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlSetting);

    if ($loaded === false) {
        die($errorBody);
    }

    // Set the metadata
    $status = array();
    $status['meta']['cached'] = true;
    $status['meta']['cache_time'] = date("Y-m-d H:i:s");

    // Grab the Current Conditions tab - #tab0
    $tab0 = $doc->getElementById("tab0");
    if (empty($tab0)) {
        die($errorBody);
    }

    // Grab all of the table rows
    $tr0 = $tab0->getElementsByTagName("tr");
    $lastIndexedService = "";

    // Begin to sort everything
    foreach ($tr0 as $row) {
        $class = $row->getAttribute('class');

        // Each alert is two rows: one with the alert information and another with the description
        if ($class === "alertInfo") {

            // Getting the specific cells from the row
            $cells         = $row->childNodes;
            $statusCell    = $cells->item(1);
            $serviceCell   = $cells->item(2);
            $severityCell  = $cells->item(3);
            $effectiveCell = $cells->item(4);
            $summaryNode   = ($statusCell && $statusCell->childNodes)
                ? $statusCell->childNodes->item(1)
                : null;

            // If the markup changed and a cell is missing, skip this alert
            // rather than crash, and make sure a following description row
            // is not attached to the previous service by mistake.
            if (!$serviceCell || !$severityCell || !$effectiveCell || !$summaryNode) {
                $lastIndexedService = "";
                continue;
            }

            $service = strtolower(str_replace(' ', '', $serviceCell->nodeValue));

            // There is a line break after the year in the effective cell, so the
            // year runs straight into the text that follows. Insert a space after
            // any four-digit year that is glued to the next character.
            $effective = preg_replace('/\b((?:19|20)\d{2})(?=\S)/', '$1 ', $effectiveCell->nodeValue);
            if ($effective === null) {
                // preg_replace() failed; fall back to the raw text.
                $effective = $effectiveCell->nodeValue;
            }

            $status['data']['current'][$service]['status']['summary'] = trim($summaryNode->nodeValue);
            $status['data']['current'][$service]['status']['severity'] = $severityCell->nodeValue;
            $status['data']['current'][$service]['status']['effective'] = $effective;
            $lastIndexedService = $service;
        }

        // Getting the Alert Description
        if ($class === "alertDesc" && $lastIndexedService !== "") {
            $div = $row->getElementsByTagName("div")->item(0);
            if ($div !== null && !empty($div->nodeValue)) {

                // We store it as html as there is formatting we'd like to preserve
                $status['data']['current'][$lastIndexedService]['description'] = $doc->saveHTML($div);
            }
        }
    }

    // Encode once for the cache (with the cached flag set to true).
    $cachePayload = json_encode($status);
    if ($cachePayload === false) {
        // Never overwrite a good cache file with an encoding failure.
        die($errorBody);
    }

    // Store the fresh data into the cached file; LOCK_EX prevents two
    // simultaneous refreshes from interleaving their writes.
    file_put_contents($cacheFile, $cachePayload, LOCK_EX);

    // The copy we return right now was not served from the cache, so flip the flag back.
    $status['meta']['cached'] = false;

    // Print out the results
    echo(json_encode($status));
} else {

    // Print out the cached results (already read above, no need to hit the disk twice)
    echo($cachedRaw);
}
?>
share|improve this question
    
This is my first post to Code Review, so if I am doing something wrong in regards to this post please let me know and I will try my best to fix it. – ecnepsnai Aug 7 '14 at 5:12
    
Haven't looked at your code much, but you seem to be doing things that are far better done using PhantomJS or selenium... headless browsers, essentially – Elias Van Ootegem Aug 7 '14 at 12:27

Your Answer

 
discard

By posting your answer, you agree to the privacy policy and terms of service.

Browse other questions tagged or ask your own question.