Tell me more ×
Code Review Stack Exchange is a question and answer site for peer programmer code reviews. It's 100% free, no registration required.

I've written a quick "local document crawler" that fetches the title tag and an expandable amount of metatag information from files on a webserver.

I develop in .net for a living and don't have a clue what I'm doing, but the site I'm helping with only has PHP hosting.

The goal is to gather metadata from files on a server, hopefully cache the output that uses the data, and display it to the user.

We experienced some x-files stuff when the first cache-file was written, and the system itself is rather slow, even when not recursing. (There's about 200 files being read in a request) The x-files stuff being php files disappearing from ftp view, which might be due to permissions being automatically set by the hosting provider.

Another thing I really don't understand is why some pages just don't seem to match my regular expressions for the metatags, so if anyone spots the issue there's bonus kudos. ;)

Feel free to give any pointers you can think of to the following code:

General class:

<?php
class MetaEnumerator
{
    private $patterns = array(
            "title" => "/<title>([^<]*)<\\x2Ftitle>/ix",
            "keywords" => '/<meta(?=[^>]*name="keywords")\s[^>$]*content="([^"]*)[">]*$/ixu',
            "description" => '/<meta(?=[^>]*name="description")\s[^>$]*content="([^"]*)[">]*$/ixu'
        );
    private $endPattern = "/<\/head>/ixu";

    private $path = "";
    private $recursive = false;
    private $files = null;

    function __construct($path, $recursive) {
        $this->path = $path;
        $this->recursive = $recursive;
    }

    public function AddPattern($key, $pattern)
    {
        $this->patterns[$key] = $pattern;
    }

    public function GetFiles()
    {
        $this->files = array();
        $this->AddItems($this->path);
        usort($this->files, array("MetaEnumerator", "CompareTitle"));
        return $this->files;
   }

    private static function CompareTitle($a, $b) {
        return strcmp($a["title"], $b["title"]);
    }

    private function AddItems($path)
    {
        foreach(scandir($path) as $item) {
            $this->AddItem($path, $item);
        }
    }

    private function AddItem($path, $item)
    {
        $fullPath = "$path/$item";
        if ($this->IsFolder($fullPath, $item) && $this->recursive) {
            $this->AddItems($fullPath);
        }
        else if ($this->IsHtmlFile($fullPath)) {
            $this->AddFile($fullPath);
        }
    }

    private function AddFile($fullPath)
    {
        $fileInfo = $this->GetFileInfo($fullPath);
        array_push($this->files, $fileInfo);
    }

    private function GetFileInfo($file)
    {
        $fileInfo = array();
        $fileInfo["path"] = $file;
        $fileInfo["modified"] = filemtime($file);
        $ptr = fopen($file, "r");
        foreach ($this->patterns as $key => $value) {
            $fileInfo[$key] = $this->FindPattern($ptr, $value);
        }
        fclose($ptr);
        return $fileInfo;
    }

    private function FindPattern($ptr, $pattern)
    {
        $retVal = "";
        rewind($ptr);
        while (($line = fgets($ptr)) !== FALSE) {
            if (preg_match($pattern, $line) > 0) {
                $retVal = preg_replace($pattern, "$1", $line);
                break;
            }
            if (preg_match($this->endPattern, $line) > 0) {
                break;
            }
        }
        return $retVal;
    }

    private function IsFolder($path, $item)
    {
        return is_dir($path) && $this->IsPhysical($item);
    }

    private function IsPhysical($folderPath) {
        return $folderPath !== "." && $folderPath !== "..";
    }

    private function IsHtmlFile($filePath)
    {
        $pathInfo = pathinfo($filePath);
        return !is_dir($filePath) && $pathInfo["extension"] == "html";
    }
}

A page using it: (This hasn't been refactored yet, so lay off with the clean code comments. ;) )

<?
include "../../../utils/MetaEnumerator.php";

$files = scandir("..");
$maxDate = null;
foreach($files as $file) {
    $date = filemtime("../$file");
    if ($maxDate == null || $date > $maxDate) {
        $maxDate = $date;
    }
}

$cacheFile = "thispage.cache";
$cacheDate = file_exists($cacheFile) ? filemtime($cacheFile)  : null;
if ($cacheDate >= $maxDate) {
    include($cacheFile);
    exit;
}
else
{
    ob_start();
?>
<html>
<head>
    <title>Our stuff</title>
</head>
<body>
<?
    echo date("d.m.Y",$maxDate);

    function AddTag($enumerator, $name) {
        $metaPrefix = '/<meta(?=[^>]*name="';
        $metaSuffix = '")\s[^>$]*content="([^"]*)[">]*$/ixu';
        $enumerator->AddPattern($name, $metaPrefix.$name.$metaSuffix);
    }

    $enumerator = new MetaEnumerator("..", false);
    AddTag($enumerator, "name");
    AddTag($enumerator, "country");
    AddTag($enumerator, "status");
    AddTag($enumerator, "active");
    $files = $enumerator->GetFiles();

    echo "<table>";
    echo "<tr>";
    echo "<th>Name</th>".
        "<th>Country</th>".
        "<th>Status</th>".
        "<th>Last update</th>";
    echo "</tr>";
    foreach($files as $file) {
        if ($file["name"] == null) continue;
        echo "<tr style=\"vertical-align: top;\">";
        echo "<td><a href=\"".$file["path"]."\" target=\"_blank\">".$file["name"]."</a></td>".
            "<td>".$file["country"]."</td>".
            "<td>".$file["eruption"]."</td>".
            "<td>".date("d.m.Y", $file["modified"])."</td>";
        echo "</tr>";
    }
    echo "</table>";
?>
</body>
</html>
<?
$fp = fopen($cacheFile, 'w');
fwrite($fp, ob_get_contents());
fclose($fp);
ob_end_flush();
}
?>
share|improve this question
Why use PHP to crawl document? Wouldn't the parsing be much faster if you do with with BASH or other language? (PHP is more like server side scripting, but will not parse fastest i believe) – YumYumYum Dec 30 '11 at 12:15
Since the only server tech. available on the given site is PHP and the point is to be able to deploy a html file to a subfolder, and get overview/list/news/sitemap pages updated automatically. Requirements, requirements. ;) – Lars-Erik Jan 3 '12 at 8:35

Know someone who can answer? Share a link to this question via email, Google+, Twitter, or Facebook.

Your Answer

 
discard

By posting your answer, you agree to the privacy policy and terms of service.

Browse other questions tagged or ask your own question.