File: includes/htmlSaveComplete.php

Recommend this page to a friend!
???
File:	`includes/htmlSaveComplete.php`
Role:	Class source
Content type:	`text/plain`
Description:	htmlSaveComplete.php
Class:	PHP Save Complete HTML Page Save HTML pages complete with images, CSS and JS
Author:	By Sarfraz Ahmed
Last change:	it is now able to fetch pages from secure urls
Date:	11 years ago
Size:	`18,562 bytes`
Download
<?php
/**
 * The htmlSaveComplete class can be used to save specified URLs completely in single file
 * by converting images to data URIs and extracting all CSS.
 *
 * Author: Sarfraz Ahmed
 * http://sarfraznawaz.wordpress.com
 *
 *
 * NOTE: If you are using this class, please keep above author information intact. Thanks
 */

/* Known issues
 ---------------------------------------------------------
Images referenced from @import-ed stylesheets are not converted to data URIs.
That step (the getImportStyles function) is intentionally not used because it
made saving a page too slow.
*/

# Runs as soon as this file is included: sets the script execution time limit to 300 seconds.
set_time_limit(300); // no more than 5 minutes !!!

require_once 'includes/content_extractor.php'; # class to extract "main" content part of given page
require_once 'includes/url_to_absolute.php'; # or implement your own function to convert relative URLs to absolute

/**
 * Saves a web page as a single self-contained HTML string.
 *
 * External stylesheets (and optionally external scripts) are downloaded and inlined,
 * images referenced from <img> tags and from CSS url(...) declarations are embedded as
 * base64 data URIs, and <a> links are rewritten to absolute URLs.
 *
 * Uses the curl, dom and mbstring extensions, plus url_to_absolute() and ContentExtractor,
 * which this class calls but does not define.
 */
class htmlSaveComplete
{

    # set debug mode on or off with 1, 0 or true, false respectively
    const DEBUG = 0;

    # holds debug data
    private $debugOutput = array();
    # url to save complete page from
    private $url = '';
    # whether to fetch the page through curl with a browser-like user agent
    private $useUserAgent = FALSE;
    # holds parsed html
    private $html = '';
    # holds DOM object
    private $dom = NULL;
    # base url that url(...) references are resolved against while toDataUri() runs (NULL = page url)
    private $cssBaseUrl = NULL;

    /**
     *  The constructor function. Allows setting url to save data from and whether to use user agent.
     *
     * @param $url - url to save complete page from.
     * @param bool $useUserAgent - whether to use user agent to get page data
     *
     * @throws exception - if the requesting browser is known not to support data URIs,
     *                     or if the provided url isn't a proper http(s)/ftp(s) url
     */
    public function __construct($url, $useUserAgent = FALSE)
    {
        # The data URI support check only makes sense when a browser is actually asking.
        # CLI / cron runs send no User-Agent header, so skip the check there instead of
        # reading an undefined index and rejecting the request with a misleading message.
        $clientAgent = isset($_SERVER['HTTP_USER_AGENT']) ? (string) $_SERVER['HTTP_USER_AGENT'] : '';

        if ($clientAgent !== '' && ! preg_match('#(Opera|Gecko|MSIE 8)#', $clientAgent))
        {
            throw new Exception('Your Browser does not support Data URIs');
        }

        # validate the URL; FILTER_VALIDATE_URL alone also accepts file:// and php:// which
        # would let a caller-supplied "url" read local files, so the scheme is checked as well
        if (! is_string($url) || ! filter_var($url, FILTER_VALIDATE_URL) || ! $this->isFetchableUrl($url))
        {
            throw new Exception('Invalid URL. Make sure to specify http(s) part.');
        }

        $this->url = $url;
        $this->useUserAgent = $useUserAgent;

        # suppress DOM parsing errors
        libxml_use_internal_errors(TRUE);

        $this->dom = new DOMDocument();
        $this->dom->preserveWhiteSpace = FALSE;
        # avoid strict error checking
        $this->dom->strictErrorChecking = FALSE;
    }

    /**
     * Gets complete page data and returns generated string
     *
     * @param bool $keepjs - whether to keep javascript
     * @param bool $contentOnly - whether to extract main content part of the page only
     * @param bool $compress - whether to remove extra whitespace
     *
     * @return string|void - the generated page, or an empty string when the page could not be
     *                       fetched or produced (almost) no html; in DEBUG mode the debug
     *                       information is printed and the script exits instead
     */
    public function getCompletePage($keepjs = FALSE, $contentOnly = FALSE, $compress = FALSE)
    {
        if ($this->useUserAgent)
        {
            $page = $this->getUrlContents($this->url);
        }
        else
        {
            $page = file_get_contents($this->url);
        }

        # a failed fetch must not reach the DOM parser: DOMDocument::loadHTML() rejects empty
        # input, and there is nothing to save anyway
        if (! is_string($page) || trim($page) === '')
        {
            return '';
        }

        $this->html = $page;

        # inline external stylesheets (their url(...) references become data URIs on the way)
        $cssBuffer = $this->buildCssBuffer();

        # inline external scripts
        $scriptBuffer = $keepjs ? $this->buildScriptBuffer() : '';

        # remove useless stuff such as <link>, <meta> and <script> tags
        $this->removeUseless($keepjs);

        # convert url(...) references left in the page itself (inline styles, <style> blocks)
        $this->html = $this->toDataUri($this->html);

        # see if we need to extract main content part
        if ($contentOnly)
        {
            $extractor = new ContentExtractor();
            $this->html = $extractor->extract($this->html);
        }

        # convert <img> tags to data URIs
        $this->convertImageToDataUri();

        # convert all relative links for <a> tags to absolute
        $this->toAbsoluteURLs();

        # anything this short is an empty document shell, not a real page
        if (strlen($this->html) <= 300)
        {
            return '';
        }

        # finally join the css and html and insert information header
        $this->html = $cssBuffer . $scriptBuffer . $this->html;
        $this->insertHeader();

        if (self::DEBUG)
        {
            $this->showDebugInfo();
            exit();
        }

        return $compress ? $this->compress($this->html) : $this->html;
    }

    /**
     * Downloads every external stylesheet of the page and wraps them in one <style> element
     *
     * @return string
     */
    private function buildCssBuffer()
    {
        $buffer = '<style>';

        foreach ($this->getStyleSheets() as $stylesheet)
        {
            $sheetUrl = $this->getFullUrl($stylesheet);
            $css = $this->getContents($sheetUrl);

            if (is_string($css) && $css !== '')
            {
                # url(...) inside a stylesheet is relative to the stylesheet, not to the page
                $buffer .= $this->toDataUri($css, $sheetUrl);
            }

            $buffer .= "\r\n\r\n";
        }

        # NOTE: stylesheets pulled in through @import are not merged here; getImportStyles()
        # exists for that but is not used because of its performance cost.

        return $buffer . '</style>' . "\r\n\r\n";
    }

    /**
     * Downloads every external script of the page and wraps them in one <script> element
     *
     * @return string
     */
    private function buildScriptBuffer()
    {
        $buffer = '<script>';

        foreach ($this->getScripts() as $script)
        {
            $buffer .= $this->getContents($this->getFullUrl($script)) . "\r\n\r\n";
        }

        return $buffer . '</script>' . "\r\n\r\n";
    }

    /**
     * Converts images to data URIs
     *
     * Images that cannot be reached or converted keep their original src.
     */
    private function convertImageToDataUri()
    {
        $images = $this->getTags('//img');

        foreach ($images as $image)
        {
            $src = trim($image->getAttribute('src'));

            # nothing to fetch, or already embedded
            if ($src === '' || stripos($src, 'data:') === 0)
            {
                continue;
            }

            $absolute = $this->getFullUrl($src);

            if (! $this->remote_file_exists($absolute))
            {
                continue;
            }

            $dataUri = $this->imageToDataUri($absolute);

            # only replace the src when a data URI was really produced; writing the NULL of a
            # failed conversion into the attribute would blank out the image
            if (is_string($dataUri) && $dataUri !== '')
            {
                $image->setAttribute('src', $dataUri);
            }
        }

        # now save html with converted images
        $this->html = $this->dom->saveHTML();
    }

    /**
     * Returns tags list for specified selector
     *
     * Re-parses the current html into the shared DOM object on every call, so nodes returned
     * by an earlier call must not be used after the next one.
     *
     * @param $selector - xpath selector expression
     *
     * @return DOMNodeList
     */
    private function getTags($selector)
    {
        $markup = is_string($this->html) ? $this->html : '';

        if (trim($markup) === '')
        {
            # DOMDocument::loadHTML() refuses empty input, so parse an empty shell instead;
            # every selector then simply matches nothing
            $markup = '<html><body></body></html>';
        }

        # keep parse errors buffered (not printed) even if other code switched this off
        libxml_use_internal_errors(TRUE);

        $this->dom->loadHTML($markup);
        $xpath = new DOMXPath($this->dom);
        $tags = $xpath->query($selector);

        # free the memory held by buffered parse errors
        libxml_clear_errors();

        return $tags;
    }

    /**
     * Converts URLs with format url(....) to data URIs
     *
     * @param $html - css or html text to process
     * @param $baseUrl - url the references are relative to; defaults to the page url
     *
     * @return mixed - the processed text (the input is returned unchanged if the regex engine fails)
     */
    private function toDataUri($html, $baseUrl = NULL)
    {
        if (! is_string($html) || $html === '')
        {
            return $html;
        }

        # createDataUri() is invoked as a regex callback and cannot take extra arguments,
        # so the base url travels through a property for the duration of the call
        $outerBase = $this->cssBaseUrl;
        $this->cssBaseUrl = $baseUrl;

        $converted = preg_replace_callback('#(url\([\'\"]?)([^\"\'\)]+)([\"\']?\))#', array($this, 'createDataUri'), $html);

        $this->cssBaseUrl = $outerBase;

        # preg_replace_callback() returns NULL on PCRE errors; never lose the text because of that
        return ($converted === NULL) ? $html : $converted;
    }

    /**
     * Inserts htmlSaveComplete information header on very start of page
     */
    private function insertHeader()
    {
        $header = '<!-- This page was saved with htmlSaveComplete (http://sarfraznawaz.wordpress.com) -->' . "\r\n\r\n";
        $this->html = $header . $this->html;
    }

    /**
     * Checks whether or not remote file exists
     *
     * Sends a body-less request and treats transfer errors and HTTP error statuses as "missing".
     *
     * @param $url
     *
     * @return bool
     */
    private function remote_file_exists($url)
    {
        if (! $this->isFetchableUrl($url))
        {
            return FALSE;
        }

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        # don't download content
        curl_setopt($ch, CURLOPT_NOBODY, 1);
        curl_setopt($ch, CURLOPT_FAILONERROR, 1);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        # this probe runs once per image, so one unresponsive host must not stall the whole save
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);

        return curl_exec($ch) !== FALSE;
    }

    /**
     * Converts images from <img> tags to data URIs
     *
     * @param $path - absolute image url
     *
     * @return string|null - generated data uri, or NULL when the url is not an image
     *                       or could not be downloaded
     */
    private function imageToDataUri($path)
    {
        # trust well-known extensions (query string ignored); for anything else,
        # e.g. gravatar-style script urls, ask getimagesize() what the resource really is
        $mimeType = $this->getImageMimeType($this->getUrlExtension($path));

        if ($mimeType === FALSE)
        {
            $mimeType = $this->detectImageMime($path);
        }

        if ($mimeType === FALSE)
        {
            return NULL;
        }

        $data = $this->getContents($path);

        if (! is_string($data) || $data === '')
        {
            return NULL;
        }

        return 'data:' . $mimeType . ';base64,' . base64_encode($data);
    }

    /**
     * Removes <link>, <meta> and <script> tags from generated page
     *
     * @param bool $keepjs - when true inline scripts survive and only external ones are removed
     */
    private function removeUseless($keepjs)
    {
        # fix showing up of garbage characters: turn every non-ASCII character into a numeric
        # entity so the DOM round trip cannot mangle it (the page is assumed to be UTF-8).
        # mb_convert_encoding(..., 'HTML-ENTITIES') did this before but is deprecated since PHP 8.2.
        $this->html = mb_encode_numericentity($this->html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

        $tags = $this->getTags('//meta | //link | //script');
        $tagsLength = $tags->length;

        for ($i = 0; $i < $tagsLength; $i ++)
        {
            $tag = $tags->item($i);
            $name = strtolower($tag->nodeName);

            if ($name === 'script')
            {
                # external scripts always go (they were inlined or are unwanted);
                # inline ones go only when javascript is not kept
                $isExternal = $tag->getAttribute('src') !== '';
                $remove = ! $keepjs || $isExternal;
            }
            elseif ($name === 'meta')
            {
                # keep the charset meta, in both its <meta charset> and http-equiv/content forms
                $declaresCharset = $tag->hasAttribute('charset')
                    || stripos($tag->getAttribute('content'), 'charset') !== FALSE;
                $remove = ! $declaresCharset;
            }
            else
            {
                $remove = TRUE;
            }

            if ($remove && $tag->parentNode !== NULL)
            {
                $tag->parentNode->removeChild($tag);
            }
        }

        $this->html = $this->dom->saveHTML();
    }

    /**
     * Gets all external stylesheets of the page
     *
     * @return array - href values exactly as written in the page
     */
    private function getStyleSheets()
    {
        $styleSheets = array();
        $links = $this->getTags('//link[contains(@rel, "stylesheet")]');

        foreach ($links as $link)
        {
            $href = $link->getAttribute('href');

            # an empty href would resolve to the page itself and inline its html as css
            if (trim($href) === '')
            {
                continue;
            }

            if (self::DEBUG)
            {
                $this->debugStore(array('stylesheet' => $href));
            }

            $styleSheets[] = $href;
        }

        return $styleSheets;
    }

    /**
     * Gets all external scripts of the page.
     *
     * Only sources containing ".js" are returned.
     *
     * @return array - src values exactly as written in the page
     */
    private function getScripts()
    {
        $scripts = array();
        $links = $this->getTags('//script[@src]');

        foreach ($links as $link)
        {
            $src = $link->getAttribute('src');

            if (strpos($src, '.js') === FALSE)
            {
                continue;
            }

            if (self::DEBUG)
            {
                $this->debugStore(array('script' => $src));
            }

            $scripts[] = $src;
        }

        return $scripts;
    }

    /**
     * Gets all URLs referenced through "@import url(...)" inside <style> elements
     *
     * Currently not called by getCompletePage().
     *
     * @return array
     */
    private function getImportStyles()
    {
        $importURLs = array();
        $styles = $this->getTags('//style');

        foreach ($styles as $style)
        {
            $found = array();
            preg_match_all('#(@import url\([\'\"]?)([^\"\'\)]+)([\"\']?\))#', $style->textContent, $found);

            if (! empty($found[2]))
            {
                foreach ($found[2] as $importUrl)
                {
                    $importURLs[] = $importUrl;
                }
            }
        }

        return $importURLs;
    }

    /**
     * Converts relative <a> tag paths to absolute paths
     *
     * Anchors without an href (e.g. <a name="...">) are left alone.
     */
    private function toAbsoluteURLs()
    {
        $links = $this->getTags('//a');

        foreach ($links as $link)
        {
            if (! $link->hasAttribute('href'))
            {
                continue;
            }

            $absolute = $this->getFullUrl($link->getAttribute('href'));

            # url_to_absolute() may fail; keep the original link in that case
            if (is_string($absolute) && $absolute !== '')
            {
                $link->setAttribute('href', $absolute);
            }
        }

        $this->html = $this->dom->saveHTML();
    }

    /**
     * Compresses generated page by collapsing runs of whitespace into a single space
     *
     * Whitespace is collapsed rather than deleted so that words separated only by a line
     * break do not get glued together.
     * NOTE(review): this is still a blunt text-level operation; <pre> blocks and inline
     * scripts that rely on line breaks are affected.
     *
     * @param $string
     *
     * @return string
     */
    private function compress($string)
    {
        # explicit ASCII whitespace only: a locale-dependent \s could match single bytes
        # of multi-byte UTF-8 characters
        $collapsed = preg_replace('#[ \t\r\n]+#', ' ', $string);

        return ($collapsed === NULL) ? $string : $collapsed;
    }

    /**
     * Gets content for given url
     *
     * Only remote http(s)/ftp(s) urls are fetched: the urls come from the downloaded page,
     * which is untrusted, and must not be able to point at local files or stream wrappers.
     *
     * @param $url
     *
     * @return string|bool - the content, or FALSE when the url is not fetchable or the download failed
     */
    private function getContents($url)
    {
        $target = is_string($url) ? trim($url) : '';

        if (! $this->isFetchableUrl($target))
        {
            return FALSE;
        }

        # best effort by design: a missing resource must not abort saving the page
        return @file_get_contents($target);
    }

    /**
     * Converts matched URLs to data URIs
     *
     * Callback for the url(...) regex in toDataUri().
     *
     * @param $matches - [0] whole match, [1] 'url(' plus optional quote, [2] the url, [3] optional quote plus ')'
     *
     * @return string - the replacement, or the original text when the url is not convertible
     */
    private function createDataUri($matches)
    {
        $reference = trim($matches[2]);

        # nothing to do for empty or already embedded references
        if ($reference === '' || stripos($reference, 'data:') === 0)
        {
            return $matches[0];
        }

        $extension = $this->getUrlExtension($reference);
        $mimeType = ($extension === 'css') ? 'text/css' : $this->getImageMimeType($extension);

        if ($mimeType === FALSE)
        {
            # not an image or stylesheet: return whatever there was originally
            return $matches[0];
        }

        $resourceUrl = $this->getFullUrl($reference, $this->cssBaseUrl);

        if (self::DEBUG)
        {
            $this->debugStore(array('datauri' => $resourceUrl));
        }

        $data = $this->getContents($resourceUrl);

        if (! is_string($data) || $data === '')
        {
            # download failed: return whatever there was originally
            return $matches[0];
        }

        return $matches[1] . 'data:' . $mimeType . ';base64,' . base64_encode($data) . $matches[3];
    }

    /**
     * Gets content for given url using curl and optionally using user agent
     *
     * @param $url
     * @param int $timeout - connect timeout in seconds
     * @param string $userAgent
     *
     * @return string|bool - the response body, or FALSE when the request failed or returned nothing
     */
    private function getUrlContents(
        $url,
        $timeout = 0,
        $userAgent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.215 Safari/534.10'
    ) {
        $rawHtml = curl_init();
        curl_setopt($rawHtml, CURLOPT_URL, $url);
        curl_setopt($rawHtml, CURLOPT_RETURNTRANSFER, 1); # return result as string rather than direct output
        curl_setopt($rawHtml, CURLOPT_CONNECTTIMEOUT, $timeout); # set the timeout
        curl_setopt($rawHtml, CURLOPT_USERAGENT, $userAgent); # set our 'user agent'

        # NOTE(review): certificate verification is switched off so https pages can be fetched
        # on hosts without a CA bundle; this also accepts forged certificates - consider
        # configuring CURLOPT_CAINFO instead.
        curl_setopt($rawHtml, CURLOPT_SSL_VERIFYPEER, false);

        $output = curl_exec($rawHtml);
        curl_close($rawHtml);

        if (! $output)
        {
            return FALSE;
        }

        return $output;
    }

    /**
     * Converts relative URLs to absolute URLs
     *
     * @param $url - url as found in the page or stylesheet
     * @param $baseUrl - url to resolve against; defaults to the page url
     *
     * @return bool|string - whatever url_to_absolute() returns for relative urls
     */
    private function getFullUrl($url, $baseUrl = NULL)
    {
        $base = (is_string($baseUrl) && $baseUrl !== '') ? $baseUrl : $this->url;
        $candidate = trim((string) $url);

        # already absolute: starts with "scheme://" (a "//" later in the url proves nothing)
        if (preg_match('#^[a-z][a-z0-9+.\-]*://#i', $candidate))
        {
            return $candidate;
        }

        # protocol-relative ("//host/path"): borrow the scheme of the base url
        if (strpos($candidate, '//') === 0)
        {
            $scheme = parse_url($base, PHP_URL_SCHEME);

            return ((is_string($scheme) && $scheme !== '') ? $scheme : 'http') . ':' . $candidate;
        }

        return url_to_absolute($base, $candidate);
    }

    /**
     * Checks if provided path is an image
     *
     * @param $path
     *
     * @return bool
     */
    private function isImage($path)
    {
        return $this->detectImageMime($path) !== FALSE;
    }

    /**
     * Asks getimagesize() for the mime type of a remote image
     *
     * @param $path - absolute image url
     *
     * @return string|bool - mime type such as "image/png", or FALSE when it is not a readable image
     */
    private function detectImageMime($path)
    {
        if (! $this->isFetchableUrl($path))
        {
            return FALSE;
        }

        # getimagesize() warns on anything that is not an image; that is the expected "no" answer here
        $info = @getimagesize($path);

        if (! is_array($info) || empty($info[0]) || empty($info['mime']))
        {
            return FALSE;
        }

        return $info['mime'];
    }

    /**
     * Maps a lower-case file extension to the image mime type used in data URIs
     *
     * @param $extension
     *
     * @return string|bool - mime type, or FALSE for extensions that are not known image types
     */
    private function getImageMimeType($extension)
    {
        $mimeTypes = array(
            'gif'  => 'image/gif',
            'png'  => 'image/png',
            'jpg'  => 'image/jpeg',
            'jpeg' => 'image/jpeg',
            'bmp'  => 'image/bmp',
            'ico'  => 'image/x-icon',
            'svg'  => 'image/svg+xml',
            'webp' => 'image/webp',
        );

        return isset($mimeTypes[$extension]) ? $mimeTypes[$extension] : FALSE;
    }

    /**
     * Extracts the lower-case file extension from a url, ignoring query string and fragment
     *
     * @param $url
     *
     * @return string - extension without the dot, or an empty string
     */
    private function getUrlExtension($url)
    {
        $url = trim((string) $url);
        $path = parse_url($url, PHP_URL_PATH);

        if ($path === NULL)
        {
            # url without a path part, e.g. "http://example.com"
            $path = '';
        }
        elseif (! is_string($path))
        {
            # seriously malformed url: cut the query/fragment off by hand
            $path = (string) substr($url, 0, strcspn($url, '?#'));
        }

        return strtolower((string) pathinfo($path, PATHINFO_EXTENSION));
    }

    /**
     * Checks that a url is a remote http(s)/ftp(s) url that is safe to hand to a fetch function
     *
     * @param $url
     *
     * @return bool
     */
    private function isFetchableUrl($url)
    {
        return is_string($url) && preg_match('#^(?:https?|ftps?)://#i', $url) === 1;
    }

    /**
     * Stores debug information
     *
     * Does nothing unless DEBUG is on. Array input is stored key by key; a key that is
     * already taken gets the current entry count appended so nothing is overwritten.
     *
     * @param $data
     */
    private function debugStore($data)
    {
        if (! self::DEBUG)
        {
            return;
        }

        if (! is_array($data))
        {
            $this->debugOutput[] = $data;

            return;
        }

        foreach ($data as $key => $value)
        {
            if (isset($this->debugOutput[$key]))
            {
                $key = $key . count($this->debugOutput);
            }

            $this->debugOutput[$key] = $value;
        }
    }

    /**
     * Shows debug information
     */
    private function showDebugInfo()
    {
        ksort($this->debugOutput);

        # the collected urls come from the remote page, so escape them before printing
        echo '<pre>';
        echo htmlspecialchars(print_r($this->debugOutput, TRUE), ENT_QUOTES);
        echo '</pre>';
    }

}
About us
Advertise on this site
For more information send a message to info at phpclasses dot org.