This class formats HTML for use in a locally hosted iframe, so that you can manipulate the content of the iframe freely without running into cross-domain issues. It uses Goutte to retrieve the HTML. I'd like to improve the code, whether that means fitting it to a particular design pattern or simply making it more efficient. I don't feel it is as clean as it could be.
<?php
/**
 * Downloads a page and rewrites its parent-relative ("../") links as absolute URLs.
 *
 * The constructor fetches $url through GoutteClient, keeps the inner HTML of the
 * <html> element in $originalHTML and stores the rewritten copy in $validHTML.
 * get_html() then wraps that copy in a doctype and <html> element.
 */
class HTMLFixerClass
{
    /** @var mixed GoutteClient instance created in the constructor. */
    public $client;

    /** @var mixed Value returned by $client->request('GET', $url). */
    public $crawler;

    /** @var string URL of the page; also the base against which "../" links are resolved. */
    public $url;

    /** @var string|null Inner HTML of the <html> element, or null when none was found. */
    public $originalHTML;

    /** @var string|null $originalHTML with "../" links made absolute, or null when none was found. */
    public $validHTML;

    /**
     * Fetches the page and prepares both the original and the rewritten HTML.
     *
     * @param string $url URL of the page to fetch.
     */
    public function __construct($url)
    {
        $this->url = $url;
        $this->client = new GoutteClient();
        $this->crawler = $this->client->request('GET', $url);

        // Leave both HTML properties null when the response has no <html> element.
        $root = $this->crawler->filter('html');
        if ($root->count()) {
            $this->originalHTML = $root->html();
            $this->validHTML = $this->correct_directories($this->originalHTML);
        }
    }

    /**
     * Returns the rewritten page as a complete HTML document.
     *
     * @return string The document, or the literal string 'Invalid URL' when no
     *                HTML was retrieved (property still null) or it is empty.
     */
    public function get_html()
    {
        // $validHTML stays null when the constructor found no <html> element, so
        // comparing against '' alone would wrongly report success.
        if (!is_string($this->validHTML) || $this->validHTML === '') {
            return 'Invalid URL';
        }

        return "<!DOCTYPE HTML>\n<html>\n" . $this->validHTML . "\n</html>";
    }

    /**
     * Replaces every leading run of "../" found inside a "<...>" span with the
     * absolute URL of the directory it points to, relative to $this->url.
     *
     * Each run is resolved on its own, so one tag may hold several links of
     * different depths. Climbing above the site root stops at the root. A "../"
     * that continues an existing path (e.g. "foo/../bar") is left untouched.
     *
     * @param string $html Markup to rewrite.
     *
     * @return string The rewritten markup; $html unchanged when $this->url has no
     *                host to build absolute links from, or when the regex engine fails.
     */
    public function correct_directories($html)
    {
        $parts = parse_url((string) $this->url);
        if ($parts === false || !isset($parts['host'])) {
            return $html;
        }

        // scheme://[user[:pass]@]host[:port] -- everything in front of the path.
        $origin = isset($parts['scheme']) ? $parts['scheme'] . '://' : '//';
        if (isset($parts['user'])) {
            $origin .= $parts['user'] . (isset($parts['pass']) ? ':' . $parts['pass'] : '') . '@';
        }
        $origin .= $parts['host'] . (isset($parts['port']) ? ':' . $parts['port'] : '');

        // Directory that contains the document: the path up to its last slash.
        $path = isset($parts['path']) ? $parts['path'] : '/';
        $directory = (string) substr($path, 0, (int) strrpos($path, '/'));
        $segments = preg_split('#/+#', $directory, -1, PREG_SPLIT_NO_EMPTY);
        if ($segments === false) {
            $segments = array();
        }

        // Turns one run such as "../../" into the absolute URL of that ancestor directory.
        $absolutise = function ($run) use ($origin, $segments) {
            $levels = substr_count($run[0], '../');
            $kept = array_slice($segments, 0, max(0, count($segments) - $levels));

            return $origin . '/' . ($kept ? implode('/', $kept) . '/' : '');
        };

        $rewritten = preg_replace_callback(
            // Same span selection as before: "<" ... "../" ... ">" on a single line.
            '/<.*?\.{2}\/.*?>/',
            function ($span) use ($absolutise) {
                // The look-behind skips "../" that is the continuation of a path.
                $fixed = preg_replace_callback('/(?<![\w.\/-])(?:\.\.\/)+/', $absolutise, $span[0]);

                return $fixed === null ? $span[0] : $fixed;
            },
            $html
        );

        return $rewritten === null ? $html : $rewritten;
    }
}