I've written a quick "local document crawler" that fetches the title tag and an expandable amount of metatag information from files on a webserver.
I develop in .net for a living and don't have a clue what I'm doing, but the site I'm helping with only has PHP hosting.
The goal is to gather metadata from files on a server, hopefully cache the output that uses the data, and display it to the user.
We experienced some x-files stuff when the first cache-file was written, and the system itself is rather slow, even when not recursing. (There's about 200 files being read in a request) The x-files stuff being php files disappearing from ftp view, which might be due to permissions being automatically set by the hosting provider.
Another thing I really don't understand is why some pages just don't seem to match my regular expressions for the metatags, so if anyone spots the issue there's bonus kudos. ;)
Feel free to give any pointers you can think of to the following code:
General class:
<?php
class MetaEnumerator
{
private $patterns = array(
"title" => "/<title>([^<]*)<\\x2Ftitle>/ix",
"keywords" => '/<meta(?=[^>]*name="keywords")\s[^>$]*content="([^"]*)[">]*$/ixu',
"description" => '/<meta(?=[^>]*name="description")\s[^>$]*content="([^"]*)[">]*$/ixu'
);
private $endPattern = "/<\/head>/ixu";
private $path = "";
private $recursive = false;
private $files = null;
function __construct($path, $recursive) {
$this->path = $path;
$this->recursive = $recursive;
}
public function AddPattern($key, $pattern)
{
$this->patterns[$key] = $pattern;
}
public function GetFiles()
{
$this->files = array();
$this->AddItems($this->path);
usort($this->files, array("MetaEnumerator", "CompareTitle"));
return $this->files;
}
private static function CompareTitle($a, $b) {
return strcmp($a["title"], $b["title"]);
}
private function AddItems($path)
{
foreach(scandir($path) as $item) {
$this->AddItem($path, $item);
}
}
private function AddItem($path, $item)
{
$fullPath = "$path/$item";
if ($this->IsFolder($fullPath, $item) && $this->recursive) {
$this->AddItems($fullPath);
}
else if ($this->IsHtmlFile($fullPath)) {
$this->AddFile($fullPath);
}
}
private function AddFile($fullPath)
{
$fileInfo = $this->GetFileInfo($fullPath);
array_push($this->files, $fileInfo);
}
private function GetFileInfo($file)
{
$fileInfo = array();
$fileInfo["path"] = $file;
$fileInfo["modified"] = filemtime($file);
$ptr = fopen($file, "r");
foreach ($this->patterns as $key => $value) {
$fileInfo[$key] = $this->FindPattern($ptr, $value);
}
fclose($ptr);
return $fileInfo;
}
private function FindPattern($ptr, $pattern)
{
$retVal = "";
rewind($ptr);
while (($line = fgets($ptr)) !== FALSE) {
if (preg_match($pattern, $line) > 0) {
$retVal = preg_replace($pattern, "$1", $line);
break;
}
if (preg_match($this->endPattern, $line) > 0) {
break;
}
}
return $retVal;
}
private function IsFolder($path, $item)
{
return is_dir($path) && $this->IsPhysical($item);
}
private function IsPhysical($folderPath) {
return $folderPath !== "." && $folderPath !== "..";
}
private function IsHtmlFile($filePath)
{
$pathInfo = pathinfo($filePath);
return !is_dir($filePath) && $pathInfo["extension"] == "html";
}
}
A page using it: (This hasn't been refactored yet, so lay off with the clean code comments. ;) )
<?
include "../../../utils/MetaEnumerator.php";
$files = scandir("..");
$maxDate = null;
foreach($files as $file) {
$date = filemtime("../$file");
if ($maxDate == null || $date > $maxDate) {
$maxDate = $date;
}
}
$cacheFile = "thispage.cache";
$cacheDate = file_exists($cacheFile) ? filemtime($cacheFile) : null;
if ($cacheDate >= $maxDate) {
include($cacheFile);
exit;
}
else
{
ob_start();
?>
<html>
<head>
<title>Our stuff</title>
</head>
<body>
<?
echo date("d.m.Y",$maxDate);
function AddTag($enumerator, $name) {
$metaPrefix = '/<meta(?=[^>]*name="';
$metaSuffix = '")\s[^>$]*content="([^"]*)[">]*$/ixu';
$enumerator->AddPattern($name, $metaPrefix.$name.$metaSuffix);
}
$enumerator = new MetaEnumerator("..", false);
AddTag($enumerator, "name");
AddTag($enumerator, "country");
AddTag($enumerator, "status");
AddTag($enumerator, "active");
$files = $enumerator->GetFiles();
echo "<table>";
echo "<tr>";
echo "<th>Name</th>".
"<th>Country</th>".
"<th>Status</th>".
"<th>Last update</th>";
echo "</tr>";
foreach($files as $file) {
if ($file["name"] == null) continue;
echo "<tr style=\"vertical-align: top;\">";
echo "<td><a href=\"".$file["path"]."\" target=\"_blank\">".$file["name"]."</a></td>".
"<td>".$file["country"]."</td>".
"<td>".$file["eruption"]."</td>".
"<td>".date("d.m.Y", $file["modified"])."</td>";
echo "</tr>";
}
echo "</table>";
?>
</body>
</html>
<?
$fp = fopen($cacheFile, 'w');
fwrite($fp, ob_get_contents());
fclose($fp);
ob_end_flush();
}
?>