diff --git a/CHANGELOG.md b/CHANGELOG.md index 206af61..8f459ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +# v1.4.3 +## 10/31/2016 + +3. [](#bugfix) + * Fixed [#11](https://github.com/Sommerregen/grav-plugin-external-links/issues/11) (Not working with cache enabling) (see PR [#15](https://github.com/Sommerregen/grav-plugin-external-links/pull/15)) + * Fixed [#13](https://github.com/Sommerregen/grav-plugin-external-links/issues/13) (Preferences Not Showing) + # v1.4.2 ## 12/06/2015 diff --git a/LICENSE b/LICENSE index 731ce27..e9bda35 100644 --- a/LICENSE +++ b/LICENSE @@ -27,7 +27,7 @@ using Grav External Links Plugin in any way. MIT LICENSE ----------- -Copyright (c) 2015 Benjamin Regler, https://github.com/sommerregen/grav-plugin-external-links +Copyright (c) 2016 Benjamin Regler, https://github.com/sommerregen/grav-plugin-external-links Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index f5b9fd4..ccff283 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,7 @@ Thanks! ## License -Copyright (c) 2015 [Benjamin Regler][github]. See also the list of [contributors] who participated in this project. +Copyright (c) 2016 [Benjamin Regler][github]. See also the list of [contributors] who participated in this project. [Dual-licensed](LICENSE) for use under the terms of the [MIT][mit-license] or [GPLv3][gpl-license] licenses. diff --git a/blueprints.yaml b/blueprints.yaml index c9c1c37..c383828 100644 --- a/blueprints.yaml +++ b/blueprints.yaml @@ -1,5 +1,5 @@ name: External Links -version: 1.4.2 +version: 1.4.3 description: "This plugin adds small icons to external and mailto links, informing users the link will take them to a new site or open their email client." 
icon: external-link author: diff --git a/classes/ExternalLinks.php b/classes/ExternalLinks.php index 312f324..6ff840d 100644 --- a/classes/ExternalLinks.php +++ b/classes/ExternalLinks.php @@ -11,7 +11,7 @@ namespace Grav\Plugin; use Grav\Common\Utils; -use Grav\Common\GravTrait; +use Grav\Common\Grav; /** * External Links @@ -21,465 +21,464 @@ */ class ExternalLinks { - /** - * @var ExternalLinks - */ - use GravTrait; - - /** ------------- - * Public methods - * -------------- - */ - - /** - * Process contents i.e. apply filer to the content. - * - * @param string $content The content to render. - * @param array $options Options to be passed to the renderer. - * @param null|Page $page Null or an instance of \Grav\Common\Page. - * - * @return string The rendered contents. - */ - public function render($content, $options = [], $page = null) - { - // Get all <a> tags and process them - $content = preg_replace_callback('~<a[^>]*>.*?</a>~i', - function($match) use ($options, $page) { - // Load PHP built-in DOMDocument class - if (($dom = $this->loadDOMDocument($match[0])) === null) { - return $match[0]; - } + /** + * @var ExternalLinks + */ + + /** ------------- + * Public methods + * -------------- + */ + + /** + * Process contents i.e. apply filer to the content. + * + * @param string $content The content to render. + * @param array $options Options to be passed to the renderer. + * @param null|Page $page Null or an instance of \Grav\Common\Page. + * + * @return string The rendered contents.
+ */ + public function render($content, $options = [], $page = null) + { + // Get all <a> tags and process them + $content = preg_replace_callback('~<a[^>]*>.*?</a>~i', + function($match) use ($options, $page) { + // Load PHP built-in DOMDocument class + if (($dom = $this->loadDOMDocument($match[0])) === null) { + return $match[0]; + } + + $a = $dom->getElementsByTagName('a')->item(0); + + // Process links with non-empty href attribute + $href = $a->getAttribute('href'); + if (strlen($href) == 0) { + return $match[0]; + } + + // Get the class of the element + $class = $a->hasAttribute('class') ? $a->getAttribute('class') : ''; + $classes = array_filter(explode(' ', $class)); + + // Exclude links with specific class from processing + $exclude = $options->get('exclude.classes', null); + if ($exclude && !!array_intersect($exclude, $classes)) { + return $match[0]; + } + + // Get domains to be seen as internal + $domains = $options->get('exclude.domains', []); + + // This is a mailto link. + if (strpos($href, 'mailto:') === 0) { + $classes[] = 'mailto'; + } + + // The link is external + elseif ($url = $this->isExternalUrl($href, $domains, $page)) { + // Add external class + $classes[] = 'external-link'; + $a->setAttribute('href', $url); + + // Add target="_blank" + $target = $options->get('target'); + if ($target) { + $a->setAttribute('target', $target); + } + + // Add no-follow.
+ $nofollow = $options->get('no_follow'); + if ($nofollow) { + $rel = array_filter(explode(' ', $a->getAttribute('rel'))); + if (!in_array('nofollow', $rel)) { + $rel[] = 'nofollow'; + $a->setAttribute('rel', implode(' ', $rel)); + } + } + + // Add image class to if it has at least one child element + $imgs = $a->getElementsByTagName('img'); + if ($imgs->length > 1) { + // Add "images" class to element, if it has multiple child images + $classes[] = 'images'; + } elseif ($imgs->length == 1) { + $imgNode = $imgs->item(0); + + // Get image size + list($width, $height) = $this->getImageSize($imgNode); + + // Determine maximum dimension of image size + $size = max($width, $height); + + // Depending on size determine image type + $classes[] = ((0 < $size) && ($size <= 32)) ? 'icon' : 'image'; + } else { + // Add "no-image" class to element, if it has no child images + $classes[] = 'no-image'; + } + + // Add title (aka alert text) + if ($options->get('title')) { + $language = Grav::instance()['language']; + $message = $language->translate(['PLUGINS.EXTERNAL_LINKS.TITLE_MESSAGE']); + + // Set default title to link else, set title as data attribute + $key = $a->hasAttribute('title') ? 'data-title' : 'title'; + $a->setAttribute($key, $message); + } + } - $a = $dom->getElementsByTagName('a')->item(0); + // Set class attribute + if (count($classes) && ($options->get('mode') === 'active')) { + $a->setAttribute('class', implode(' ', $classes)); + } - // Process links with non-empty href attribute - $href = $a->getAttribute('href'); - if (strlen($href) == 0) { - return $match[0]; - } + // Save Dom document back to HTML representation + $html = $this->saveDOMDocument($dom); + return $html; + }, $content); - // Get the class of the element - $class = $a->hasAttribute('class') ? 
$a->getAttribute('class') : ''; - $classes = array_filter(explode(' ', $class)); + // Write content back to page + return $content; + } - // Exclude links with specific class from processing - $exclude = $options->get('exclude.classes', null); - if ($exclude && !!array_intersect($exclude, $classes)) { - return $match[0]; + /** ------------------------------- + * Private/protected helper methods + * -------------------------------- + */ + + /** + * Test if a URL is external + * + * @param string $url The URL to test. + * @param array $domains An array of domains to be seen as internal. + * @param null|Page $page Null or an instance of \Grav\Common\Page. + * + * @return mixed Returns the URL as a string, if it is external, + * false otherwise. + */ + protected function isExternalUrl($url, $domains = [], $page = null) + { + static $allowed_protocols; + static $pattern; + + /** @var Config $config */ + $config = Grav::instance()['config']; + + /** @var Page $page */ + $page = $page ?: Grav::instance()['page']; + + // Statically store allowed protocols + if (!isset($allowed_protocols)) { + $allowed_protocols = array_flip( + $config->get('plugins.external_links.links.schemes', ['http', 'https']) + ); } - // Get domains to be seen as internal - $domains = $options->get('exclude.domains', []); + // Statically store internal domains as a PCRE pattern. + if (!isset($pattern) || (count($domains) > 0)) { + $domains = array_merge($domains, + array(Grav::instance()['base_url_absolute'])); - // This is a mailto link. - if (strpos($href, 'mailto:') === 0) { - $classes[] = 'mailto'; - } - - // The link is external - elseif ($url = $this->isExternalUrl($href, $domains, $page)) { - // Add external class - $classes[] = 'external-link'; - $a->setAttribute('href', $url); - - // Add target="_blank" - $target = $options->get('target'); - if ($target) { - $a->setAttribute('target', $target); - } - - // Add no-follow. 
- $nofollow = $options->get('no_follow'); - if ($nofollow) { - $rel = array_filter(explode(' ', $a->getAttribute('rel'))); - if (!in_array('nofollow', $rel)) { - $rel[] = 'nofollow'; - $a->setAttribute('rel', implode(' ', $rel)); + foreach ($domains as $domain) { + $domains[] = preg_quote($domain, '#'); } - } - - // Add image class to if it has at least one child element - $imgs = $a->getElementsByTagName('img'); - if ($imgs->length > 1) { - // Add "images" class to element, if it has multiple child images - $classes[] = 'images'; - } elseif ($imgs->length == 1) { - $imgNode = $imgs->item(0); - - // Get image size - list($width, $height) = $this->getImageSize($imgNode); - - // Determine maximum dimension of image size - $size = max($width, $height); - - // Depending on size determine image type - $classes[] = ((0 < $size) && ($size <= 32)) ? 'icon' : 'image'; - } else { - // Add "no-image" class to element, if it has no child images - $classes[] = 'no-image'; - } - - // Add title (aka alert text) - if ($options->get('title')) { - $language = self::getGrav()['language']; - $message = $language->translate(['PLUGINS.EXTERNAL_LINKS.TITLE_MESSAGE']); - - // Set default title to link else, set title as data attribute - $key = $a->hasAttribute('title') ? 'data-title' : 'title'; - $a->setAttribute($key, $message); - } + $pattern = '#(' . str_replace(array('\*', '/*'), '.*?', + implode('|', $domains)) . ')#i'; } - // Set class attribute - if (count($classes) && ($options->get('mode') === 'active')) { - $a->setAttribute('class', implode(' ', $classes)); + $external = false; + // Check for URLs that don't match any excluded domain + if (!preg_match($pattern, $url)) { + // Check if URL is external by extracting colon position + $colonpos = strpos($url, ':'); + if ($colonpos > 0) { + // We found a colon, possibly a protocol. Verify. 
+ $protocol = strtolower(substr($url, 0, $colonpos)); + if (isset($allowed_protocols[$protocol])) { + // The protocol turns out be an allowed protocol + $external = $url; + } + } elseif ($config->get('plugins.external_links.links.www')) { + // Remove possible path duplicate + $route = Grav::instance()['base_url'] . $page->route(); + $href = Utils::startsWith($url, $route) + ? ltrim(mb_substr($url, mb_strlen($route)), '/') + : $url; + + // We found an url without protocol, but with starting 'www' (sub-)domain + if (Utils::startsWith($url, 'www.')) { + $external = 'http://' . $url; + } elseif (Utils::startsWith($href, 'www.')) { + $external = 'http://' . $href; + } + } } - // Save Dom document back to HTML representation - $html = $this->saveDOMDocument($dom); - return $html; - }, $content); - - // Write content back to page - return $content; - } - - /** ------------------------------- - * Private/protected helper methods - * -------------------------------- - */ - - /** - * Test if a URL is external - * - * @param string $url The URL to test. - * @param array $domains An array of domains to be seen as internal. - * @param null|Page $page Null or an instance of \Grav\Common\Page. - * - * @return mixed Returns the URL as a string, if it is external, - * false otherwise. - */ - protected function isExternalUrl($url, $domains = [], $page = null) - { - static $allowed_protocols; - static $pattern; - - /** @var Config $config */ - $config = self::getGrav()['config']; - - /** @var Page $page */ - $page = $page ?: self::getGrav()['page']; - - // Statically store allowed protocols - if (!isset($allowed_protocols)) { - $allowed_protocols = array_flip( - $config->get('plugins.external_links.links.schemes', ['http', 'https']) - ); + // Only if a valid protocol or an URL starting with 'www.' was found return true + return $external; } - // Statically store internal domains as a PCRE pattern. 
- if (!isset($pattern) || (count($domains) > 0)) { - $domains = array_merge($domains, - array(self::getGrav()['base_url_absolute'])); + /** + * Determine the size of an image + * + * @param DOMNode $imgNode The image already parsed as a DOMNode + * @param integer $limit Load first $limit KB of remote image + * + * @return array Return the dimension of the image of the + * format array(width, height) + */ + protected function getImageSize($imgNode, $limit = 32) + { + // Hold units (assume standard font with 16px base pixel size) + // Calculations based on pixels + $units = array( + 'px' => 1, /* base unit: pixel */ + 'pt' => 16 / 12, /* 12 point = 16 pixel = 1/72 inch */ + 'pc' => 16, /* 1 pica = 16 pixel = 12 points */ + + 'in' => 96, /* 1 inch = 96 pixel = 2.54 centimeters */ + 'mm' => 96 / 25.4, /* 1 millimeter = 96 pixel / 1 inch [mm] */ + 'cm' => 96 / 2.54, /* 1 centimeter = 96 pixel / 1 inch [cm] */ + 'm' => 96 / 0.0254, /* 1 centimeter = 96 pixel / 1 inch [m] */ + + 'ex' => 7, /* 1 ex = 7 pixel */ + 'em' => 16, /* 1 em = 16 pixel */ + 'rem' => 16, /* 1 rem = 16 pixel */ + + '%' => 16 / 100, /* 100 percent = 16 pixel */ + ); + + // Initialize dimensions + $width = 0; + $height = 0; + + // Determine image dimensions based on "src" atrribute + if ($imgNode->hasAttribute('src')) { + $src = $imgNode->getAttribute('src'); + + // Simple check if the URL is internal i.e. check if path exists + $path = $_SERVER['DOCUMENT_ROOT'] . $src; + if (realpath($path) && is_file($path)) { + $size = @getimagesize($path); + } else { + // The URL is external; try to load it (default: 32 KB) + $size = $this->getRemoteImageSize($src, $limit * 1024); + } + } - foreach ($domains as $domain) { - $domains[] = preg_quote($domain, '#'); - } - $pattern = '#(' . str_replace(array('\*', '/*'), '.*?', - implode('|', $domains)) . ')#i'; - } + // Read out width and height from attributes + $width = $imgNode->hasAttribute('width') ? 
+ $imgNode->getAttribute('width') : $size[0]; + $height = $imgNode->hasAttribute('height') ? + $imgNode->getAttribute('height') : $size[1]; + + // Get width and height from style attribute + if ( $imgNode->hasAttribute('style') ) { + $style = $imgNode->getAttribute('style'); + + // Width + if (preg_match('~width:\s*(\d+)([a-z]+)~i', $style, $matches)) { + $width = $matches[1]; + // Convert unit to pixel + if ( isset($units[$matches[2]]) ) { + $width *= $units[$matches[2]]; + } + } - $external = false; - // Check for URLs that don't match any excluded domain - if (!preg_match($pattern, $url)) { - // Check if URL is external by extracting colon position - $colonpos = strpos($url, ':'); - if ($colonpos > 0) { - // We found a colon, possibly a protocol. Verify. - $protocol = strtolower(substr($url, 0, $colonpos)); - if (isset($allowed_protocols[$protocol])) { - // The protocol turns out be an allowed protocol - $external = $url; - } - } elseif ($config->get('plugins.external_links.links.www')) { - // Remove possible path duplicate - $route = self::getGrav()['base_url'] . $page->route(); - $href = Utils::startsWith($url, $route) - ? ltrim(mb_substr($url, mb_strlen($route)), '/') - : $url; - - // We found an url without protocol, but with starting 'www' (sub-)domain - if (Utils::startsWith($url, 'www.')) { - $external = 'http://' . $url; - } elseif (Utils::startsWith($href, 'www.')) { - $external = 'http://' . $href; + // Height + if (preg_match('~height:\s*(\d+)([a-z]+)~i', $style, $matches)) { + $height = $matches[1]; + // Convert unit to pixel + if (isset($units[$matches[2]])) { + $height *= $units[$matches[2]]; + } + } } - } - } - // Only if a valid protocol or an URL starting with 'www.' 
was found return true - return $external; - } - - /** - * Determine the size of an image - * - * @param DOMNode $imgNode The image already parsed as a DOMNode - * @param integer $limit Load first $limit KB of remote image - * - * @return array Return the dimension of the image of the - * format array(width, height) - */ - protected function getImageSize($imgNode, $limit = 32) - { - // Hold units (assume standard font with 16px base pixel size) - // Calculations based on pixels - $units = array( - 'px' => 1, /* base unit: pixel */ - 'pt' => 16 / 12, /* 12 point = 16 pixel = 1/72 inch */ - 'pc' => 16, /* 1 pica = 16 pixel = 12 points */ - - 'in' => 96, /* 1 inch = 96 pixel = 2.54 centimeters */ - 'mm' => 96 / 25.4, /* 1 millimeter = 96 pixel / 1 inch [mm] */ - 'cm' => 96 / 2.54, /* 1 centimeter = 96 pixel / 1 inch [cm] */ - 'm' => 96 / 0.0254, /* 1 centimeter = 96 pixel / 1 inch [m] */ - - 'ex' => 7, /* 1 ex = 7 pixel */ - 'em' => 16, /* 1 em = 16 pixel */ - 'rem' => 16, /* 1 rem = 16 pixel */ - - '%' => 16 / 100, /* 100 percent = 16 pixel */ - ); - - // Initialize dimensions - $width = 0; - $height = 0; - - // Determine image dimensions based on "src" atrribute - if ($imgNode->hasAttribute('src')) { - $src = $imgNode->getAttribute('src'); - - // Simple check if the URL is internal i.e. check if path exists - $path = $_SERVER['DOCUMENT_ROOT'] . $src; - if (realpath($path) && is_file($path)) { - $size = @getimagesize($path); - } else { - // The URL is external; try to load it (default: 32 KB) - $size = $this->getRemoteImageSize($src, $limit * 1024); - } + // Update width and height + $size[0] = $width; + $size[1] = $height; + + // Return image dimensions + return $size; } - // Read out width and height from attributes - $width = $imgNode->hasAttribute('width') ? - $imgNode->getAttribute('width') : $size[0]; - $height = $imgNode->hasAttribute('height') ? 
- $imgNode->getAttribute('height') : $size[1]; - - // Get width and height from style attribute - if ( $imgNode->hasAttribute('style') ) { - $style = $imgNode->getAttribute('style'); - - // Width - if (preg_match('~width:\s*(\d+)([a-z]+)~i', $style, $matches)) { - $width = $matches[1]; - // Convert unit to pixel - if ( isset($units[$matches[2]]) ) { - $width *= $units[$matches[2]]; + /** + * Get the size of a remote image + * + * @param string $uri The URI of the remote image + * @param integer $limit Load first $limit bytes of remote image + * + * @return mixed Returns an array with up to 7 elements + */ + protected function getRemoteImageSize($uri, $limit = -1) + { + // Create temporary file to store data from $uri + $tmp_name = tempnam(sys_get_temp_dir(), uniqid('ris')); + if ($tmp_name === false) { + return false; } - } - - // Height - if (preg_match('~height:\s*(\d+)([a-z]+)~i', $style, $matches)) { - $height = $matches[1]; - // Convert unit to pixel - if (isset($units[$matches[2]])) { - $height *= $units[$matches[2]]; + + // Open temporary file + $tmp = fopen($tmp_name, 'rb'); + + // Check which method we should use to get remote image sizes + $allow_url_fopen = ini_get('allow_url_fopen') ? true : false; + $use_curl = function_exists('curl_version'); + + // Use stream copy + if ($allow_url_fopen) { + $options = []; + if ( $limit > 0 ) { + // Loading number of $limit bytes + $options['http']['header'] = array('Range: bytes=0-' . 
$limit); + } + + // Create stream context + $context = stream_context_create($options); + @copy($uri, $tmp_name, $context); + + // Use Curl + } elseif ($use_curl) { + // Initialize Curl + $options = array( + CURLOPT_HEADER => false, // Don't return headers + CURLOPT_FOLLOWLOCATION => true, // Follow redirects + CURLOPT_AUTOREFERER => true, // Set referrer on redirect + CURLOPT_CONNECTTIMEOUT => 120, // Timeout on connect + CURLOPT_TIMEOUT => 120, // Timeout on response + CURLOPT_MAXREDIRS => 10, // Stop after 10 redirects + CURLOPT_ENCODING => '', // Handle all encodings + CURLOPT_BINARYTRANSFER => true, // Transfer as binary file + CURLOPT_FILE => $tmp, // Curl file + CURLOPT_URL => $uri, // URI + ); + + $curl = curl_init(); + curl_setopt_array($curl, $options); + + if ( $limit > 0 ) { + // Loading number of $limit + $headers = array('Range: bytes=0-' . $limit); + curl_setopt($curl, CURLOPT_HTTPHEADER, $headers); + curl_setopt($curl, CURLOPT_RANGE, '0-' . $limit); + + // Abort request when more data is received + curl_setopt($curl, CURLOPT_BUFFERSIZE, 512); // More progress info + curl_setopt($curl, CURLOPT_NOPROGRESS, false); // Monitor progress + curl_setopt($curl, CURLOPT_PROGRESSFUNCTION, + function($download_size, $downloaded, $upload_size, $uploaded) use ($limit) { + // If $downloaded exceeds $limit, returning non-zero breaks + // the connection! + return ( $downloaded > $limit ) ? 
1 : 0; + }); + } + + // Execute Curl + curl_exec($curl); + curl_close($curl); } - } - } - // Update width and height - $size[0] = $width; - $size[1] = $height; - - // Return image dimensions - return $size; - } - - /** - * Get the size of a remote image - * - * @param string $uri The URI of the remote image - * @param integer $limit Load first $limit bytes of remote image - * - * @return mixed Returns an array with up to 7 elements - */ - protected function getRemoteImageSize($uri, $limit = -1) - { - // Create temporary file to store data from $uri - $tmp_name = tempnam(sys_get_temp_dir(), uniqid('ris')); - if ($tmp_name === false) { - return false; - } + // Close temporary file + fclose($tmp); - // Open temporary file - $tmp = fopen($tmp_name, 'rb'); - - // Check which method we should use to get remote image sizes - $allow_url_fopen = ini_get('allow_url_fopen') ? true : false; - $use_curl = function_exists('curl_version'); - - // Use stream copy - if ($allow_url_fopen) { - $options = []; - if ( $limit > 0 ) { - // Loading number of $limit bytes - $options['http']['header'] = array('Range: bytes=0-' . $limit); - } - - // Create stream context - $context = stream_context_create($options); - @copy($uri, $tmp_name, $context); - - // Use Curl - } elseif ($use_curl) { - // Initialize Curl - $options = array( - CURLOPT_HEADER => false, // Don't return headers - CURLOPT_FOLLOWLOCATION => true, // Follow redirects - CURLOPT_AUTOREFERER => true, // Set referrer on redirect - CURLOPT_CONNECTTIMEOUT => 120, // Timeout on connect - CURLOPT_TIMEOUT => 120, // Timeout on response - CURLOPT_MAXREDIRS => 10, // Stop after 10 redirects - CURLOPT_ENCODING => '', // Handle all encodings - CURLOPT_BINARYTRANSFER => true, // Transfer as binary file - CURLOPT_FILE => $tmp, // Curl file - CURLOPT_URL => $uri, // URI - ); - - $curl = curl_init(); - curl_setopt_array($curl, $options); - - if ( $limit > 0 ) { - // Loading number of $limit - $headers = array('Range: bytes=0-' . 
$limit); - curl_setopt($curl, CURLOPT_HTTPHEADER, $headers); - curl_setopt($curl, CURLOPT_RANGE, '0-' . $limit); - - // Abort request when more data is received - curl_setopt($curl, CURLOPT_BUFFERSIZE, 512); // More progress info - curl_setopt($curl, CURLOPT_NOPROGRESS, false); // Monitor progress - curl_setopt($curl, CURLOPT_PROGRESSFUNCTION, - function($download_size, $downloaded, $upload_size, $uploaded) use ($limit) { - // If $downloaded exceeds $limit, returning non-zero breaks - // the connection! - return ( $downloaded > $limit ) ? 1 : 0; - }); - } - - // Execute Curl - curl_exec($curl); - curl_close($curl); - } + // Retrieve image information + $info = array(0, 0, 'width="0" height="0"'); + if (filesize($tmp_name) > 0) { + $info = @getimagesize($tmp_name); + } - // Close temporary file - fclose($tmp); + // Delete temporary file + unlink($tmp_name); - // Retrieve image information - $info = array(0, 0, 'width="0" height="0"'); - if (filesize($tmp_name) > 0) { - $info = @getimagesize($tmp_name); + return $info; } - // Delete temporary file - unlink($tmp_name); - - return $info; - } - - /** - * Load contents into PHP built-in DOMDocument object - * - * Two Really good resources to handle DOMDocument with HTML(5) - * correctly. - * - * @see http://stackoverflow.com/questions/3577641/how-do-you-parse-and-process-html-xml-in-php - * @see http://stackoverflow.com/questions/7997936/how-do-you-format-dom-structures-in-php - * - * @param string $content The content to be loaded into the - * DOMDocument object - * - * @return DOMDocument DOMDocument object of content - */ - protected function loadDOMDocument($content) - { - // Clear previous errors - if (libxml_use_internal_errors(true) === true) { - libxml_clear_errors(); - } + /** + * Load contents into PHP built-in DOMDocument object + * + * Two Really good resources to handle DOMDocument with HTML(5) + * correctly. 
+ * + * @see http://stackoverflow.com/questions/3577641/how-do-you-parse-and-process-html-xml-in-php + * @see http://stackoverflow.com/questions/7997936/how-do-you-format-dom-structures-in-php + * + * @param string $content The content to be loaded into the + * DOMDocument object + * + * @return DOMDocument DOMDocument object of content + */ + protected function loadDOMDocument($content) + { + // Clear previous errors + if (libxml_use_internal_errors(true) === true) { + libxml_clear_errors(); + } - // Parse content using PHP built-in DOMDocument class - $document = new \DOMDocument('1.0', 'UTF-8'); + // Parse content using PHP built-in DOMDocument class + $document = new \DOMDocument('1.0', 'UTF-8'); - // Encode contents as UTF-8, strip whitespaces & normalize newlines - $content = mb_convert_encoding($content, 'HTML-ENTITIES', 'UTF-8'); + // Encode contents as UTF-8, strip whitespaces & normalize newlines + $content = mb_convert_encoding($content, 'HTML-ENTITIES', 'UTF-8'); - // $whitespaces = array( - // '~\R~u' => "\n", // Normalize new line - // '~\>[^\S ]+~s' => '>', // Strip whitespaces after tags, except space - // '~[^\S ]+\<~s' => '<', // Strip whitespaces before tags, except space - // '~(\s)+~s' => '\\1' // Shorten multiple whitespace sequences - // ); - // $content = preg_replace(array_keys($whitespaces), $whitespaces, $content); + // $whitespaces = array( + // '~\R~u' => "\n", // Normalize new line + // '~\>[^\S ]+~s' => '>', // Strip whitespaces after tags, except space + // '~[^\S ]+\<~s' => '<', // Strip whitespaces before tags, except space + // '~(\s)+~s' => '\\1' // Shorten multiple whitespace sequences + // ); + // $content = preg_replace(array_keys($whitespaces), $whitespaces, $content); - // Parse the HTML using UTF-8 - // The @ before the method call suppresses any warnings that - // loadHTML might throw because of invalid HTML in the page. 
- @$document->loadHTML($content); + // Parse the HTML using UTF-8 + // The @ before the method call suppresses any warnings that + // loadHTML might throw because of invalid HTML in the page. + @$document->loadHTML($content); - // Do nothing, if DOM is empty - if (is_null($document->documentElement)) { - return null; - } + // Do nothing, if DOM is empty + if (is_null($document->documentElement)) { + return null; + } - return $document; - } - - /** - * Save contents of PHP built-in DOMDocument object as HTML5 - * - * @param DOMDocument $document DOMDocument object with nodes - * - * @return string The outputted DOM document as HTML(5) - * compliant string - */ - protected function saveDOMDocument($document) - { - // Pretty print output - $document->preserveWhiteSpace = false; - $document->formatOutput = true; - - // Transform DOM document to valid HTML(5) - $content = ''; - $body = $document->getElementsByTagName('body')->item(0); - foreach ($body->childNodes as $node) { - // Expand empty tags (e.g. <br/> to <br></br>) - if (($html = $document->saveXML($node, LIBXML_NOEMPTYTAG)) !== false) { - $content .= $html; - } + return $document; } - // Fix formatting for self-closing tags in HTML5 and removing - // encapsulated (uncommented) CDATA blocks in