www.aleph1.co.uk Git - yaffs-website/blob - web/core/lib/Drupal/Component/Utility/Html.php

   1 <?php
   2
   3 namespace Drupal\Component\Utility;
   4
   5 /**
   6  * Provides DOMDocument helpers for parsing and serializing HTML strings.
   7  *
   8  * @ingroup utility
   9  */
  10 class Html {
  11
  12   /**
  13    * An array of previously cleaned HTML classes.
  14    *
  15    * @var array
  16    */
  17   protected static $classes = [];
  18
  19   /**
  20    * An array of the initial IDs used in one request.
  21    *
  22    * @var array
  23    */
  24   protected static $seenIdsInit;
  25
  26   /**
  27    * An array of IDs, including incremented versions when an ID is duplicated.
  28    * @var array
  29    */
  30   protected static $seenIds;
  31
  32   /**
  33    * Stores whether the current request was sent via AJAX.
  34    *
  35    * @var bool
  36    */
  37   protected static $isAjax = FALSE;
  38
  39   /**
  40    * All attributes that may contain URIs.
  41    *
  42    * - The attributes 'code' and 'codebase' are omitted, because they only exist
  43    *   for the <applet> tag. The time of Java applets has passed.
  44    * - The attribute 'icon' is omitted, because no browser implements the
  45    *   <command> tag anymore.
  46    *  See https://developer.mozilla.org/en-US/docs/Web/HTML/Element/command.
  47    * - The 'manifest' attribute is omitted because it only exists for the <html>
  48    *   tag. That tag only makes sense in a HTML-served-as-HTML context, in which
  49    *   case relative URLs are guaranteed to work.
  50    *
  51    * @see https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
  52    * @see https://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value
  53    *
  54    * @var string[]
  55    */
  56   protected static $uriAttributes = ['href', 'poster', 'src', 'cite', 'data', 'action', 'formaction', 'srcset', 'about'];
  57
  58   /**
  59    * Prepares a string for use as a valid class name.
  60    *
  61    * Do not pass one string containing multiple classes as they will be
  62    * incorrectly concatenated with dashes, i.e. "one two" will become "one-two".
  63    *
  64    * @param mixed $class
  65    *   The class name to clean. It can be a string or anything that can be cast
  66    *   to string.
  67    *
  68    * @return string
  69    *   The cleaned class name.
  70    */
  71   public static function getClass($class) {
  72     $class = (string) $class;
  73     if (!isset(static::$classes[$class])) {
  74       static::$classes[$class] = static::cleanCssIdentifier(mb_strtolower($class));
  75     }
  76     return static::$classes[$class];
  77   }
  78
  79   /**
  80    * Prepares a string for use as a CSS identifier (element, class, or ID name).
  81    *
  82    * Link below shows the syntax for valid CSS identifiers (including element
  83    * names, classes, and IDs in selectors).
  84    *
  85    * @see http://www.w3.org/TR/CSS21/syndata.html#characters
  86    *
  87    * @param string $identifier
  88    *   The identifier to clean.
  89    * @param array $filter
  90    *   An array of string replacements to use on the identifier.
  91    *
  92    * @return string
  93    *   The cleaned identifier.
  94    */
  95   public static function cleanCssIdentifier($identifier, array $filter = [
  96     ' ' => '-',
  97     '_' => '-',
  98     '/' => '-',
  99     '[' => '-',
 100     ']' => '',
 101   ]) {
 102     // We could also use strtr() here but its much slower than str_replace(). In
 103     // order to keep '__' to stay '__' we first replace it with a different
 104     // placeholder after checking that it is not defined as a filter.
 105     $double_underscore_replacements = 0;
 106     if (!isset($filter['__'])) {
 107       $identifier = str_replace('__', '##', $identifier, $double_underscore_replacements);
 108     }
 109     $identifier = str_replace(array_keys($filter), array_values($filter), $identifier);
 110     // Replace temporary placeholder '##' with '__' only if the original
 111     // $identifier contained '__'.
 112     if ($double_underscore_replacements > 0) {
 113       $identifier = str_replace('##', '__', $identifier);
 114     }
 115
 116     // Valid characters in a CSS identifier are:
 117     // - the hyphen (U+002D)
 118     // - a-z (U+0030 - U+0039)
 119     // - A-Z (U+0041 - U+005A)
 120     // - the underscore (U+005F)
 121     // - 0-9 (U+0061 - U+007A)
 122     // - ISO 10646 characters U+00A1 and higher
 123     // We strip out any character not in the above list.
 124     $identifier = preg_replace('/[^\x{002D}\x{0030}-\x{0039}\x{0041}-\x{005A}\x{005F}\x{0061}-\x{007A}\x{00A1}-\x{FFFF}]/u', '', $identifier);
 125     // Identifiers cannot start with a digit, two hyphens, or a hyphen followed by a digit.
 126     $identifier = preg_replace([
 127       '/^[0-9]/',
 128       '/^(-[0-9])|^(--)/',
 129     ], ['_', '__'], $identifier);
 130     return $identifier;
 131   }
 132
 133   /**
 134    * Sets if this request is an Ajax request.
 135    *
 136    * @param bool $is_ajax
 137    *   TRUE if this request is an Ajax request, FALSE otherwise.
 138    */
 139   public static function setIsAjax($is_ajax) {
 140     static::$isAjax = $is_ajax;
 141   }
 142
 143   /**
 144    * Prepares a string for use as a valid HTML ID and guarantees uniqueness.
 145    *
 146    * This function ensures that each passed HTML ID value only exists once on
 147    * the page. By tracking the already returned ids, this function enables
 148    * forms, blocks, and other content to be output multiple times on the same
 149    * page, without breaking (X)HTML validation.
 150    *
 151    * For already existing IDs, a counter is appended to the ID string.
 152    * Therefore, JavaScript and CSS code should not rely on any value that was
 153    * generated by this function and instead should rely on manually added CSS
 154    * classes or similarly reliable constructs.
 155    *
 156    * Two consecutive hyphens separate the counter from the original ID. To
 157    * manage uniqueness across multiple Ajax requests on the same page, Ajax
 158    * requests POST an array of all IDs currently present on the page, which are
 159    * used to prime this function's cache upon first invocation.
 160    *
 161    * To allow reverse-parsing of IDs submitted via Ajax, any multiple
 162    * consecutive hyphens in the originally passed $id are replaced with a
 163    * single hyphen.
 164    *
 165    * @param string $id
 166    *   The ID to clean.
 167    *
 168    * @return string
 169    *   The cleaned ID.
 170    */
 171   public static function getUniqueId($id) {
 172     // If this is an Ajax request, then content returned by this page request
 173     // will be merged with content already on the base page. The HTML IDs must
 174     // be unique for the fully merged content. Therefore use unique IDs.
 175     if (static::$isAjax) {
 176       return static::getId($id) . '--' . Crypt::randomBytesBase64(8);
 177     }
 178
 179     // @todo Remove all that code once we switch over to random IDs only,
 180     // see https://www.drupal.org/node/1090592.
 181     if (!isset(static::$seenIdsInit)) {
 182       static::$seenIdsInit = [];
 183     }
 184     if (!isset(static::$seenIds)) {
 185       static::$seenIds = static::$seenIdsInit;
 186     }
 187
 188     $id = static::getId($id);
 189
 190     // Ensure IDs are unique by appending a counter after the first occurrence.
 191     // The counter needs to be appended with a delimiter that does not exist in
 192     // the base ID. Requiring a unique delimiter helps ensure that we really do
 193     // return unique IDs and also helps us re-create the $seen_ids array during
 194     // Ajax requests.
 195     if (isset(static::$seenIds[$id])) {
 196       $id = $id . '--' . ++static::$seenIds[$id];
 197     }
 198     else {
 199       static::$seenIds[$id] = 1;
 200     }
 201     return $id;
 202   }
 203
 204   /**
 205    * Prepares a string for use as a valid HTML ID.
 206    *
 207    * Only use this function when you want to intentionally skip the uniqueness
 208    * guarantee of self::getUniqueId().
 209    *
 210    * @param string $id
 211    *   The ID to clean.
 212    *
 213    * @return string
 214    *   The cleaned ID.
 215    *
 216    * @see self::getUniqueId()
 217    */
 218   public static function getId($id) {
 219     $id = str_replace([' ', '_', '[', ']'], ['-', '-', '-', ''], mb_strtolower($id));
 220
 221     // As defined in http://www.w3.org/TR/html4/types.html#type-name, HTML IDs can
 222     // only contain letters, digits ([0-9]), hyphens ("-"), underscores ("_"),
 223     // colons (":"), and periods ("."). We strip out any character not in that
 224     // list. Note that the CSS spec doesn't allow colons or periods in identifiers
 225     // (http://www.w3.org/TR/CSS21/syndata.html#characters), so we strip those two
 226     // characters as well.
 227     $id = preg_replace('/[^A-Za-z0-9\-_]/', '', $id);
 228
 229     // Removing multiple consecutive hyphens.
 230     $id = preg_replace('/\-+/', '-', $id);
 231     return $id;
 232   }
 233
 234   /**
 235    * Resets the list of seen IDs.
 236    */
 237   public static function resetSeenIds() {
 238     static::$seenIds = NULL;
 239   }
 240
 241   /**
 242    * Normalizes an HTML snippet.
 243    *
 244    * This function is essentially \DOMDocument::normalizeDocument(), but
 245    * operates on an HTML string instead of a \DOMDocument.
 246    *
 247    * @param string $html
 248    *   The HTML string to normalize.
 249    *
 250    * @return string
 251    *   The normalized HTML string.
 252    */
 253   public static function normalize($html) {
 254     $document = static::load($html);
 255     return static::serialize($document);
 256   }
 257
 258   /**
 259    * Parses an HTML snippet and returns it as a DOM object.
 260    *
 261    * This function loads the body part of a partial (X)HTML document and returns
 262    * a full \DOMDocument object that represents this document.
 263    *
 264    * Use \Drupal\Component\Utility\Html::serialize() to serialize this
 265    * \DOMDocument back to a string.
 266    *
 267    * @param string $html
 268    *   The partial (X)HTML snippet to load. Invalid markup will be corrected on
 269    *   import.
 270    *
 271    * @return \DOMDocument
 272    *   A \DOMDocument that represents the loaded (X)HTML snippet.
 273    */
 274   public static function load($html) {
 275     $document = <<<EOD
 276 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 277 <html xmlns="http://www.w3.org/1999/xhtml">
 278 <head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>
 279 <body>!html</body>
 280 </html>
 281 EOD;
 282     // PHP's \DOMDocument serialization adds extra whitespace when the markup
 283     // of the wrapping document contains newlines, so ensure we remove all
 284     // newlines before injecting the actual HTML body to be processed.
 285     $document = strtr($document, ["\n" => '', '!html' => $html]);
 286
 287     $dom = new \DOMDocument();
 288     // Ignore warnings during HTML soup loading.
 289     @$dom->loadHTML($document);
 290
 291     return $dom;
 292   }
 293
 294   /**
 295    * Converts the body of a \DOMDocument back to an HTML snippet.
 296    *
 297    * The function serializes the body part of a \DOMDocument back to an (X)HTML
 298    * snippet. The resulting (X)HTML snippet will be properly formatted to be
 299    * compatible with HTML user agents.
 300    *
 301    * @param \DOMDocument $document
 302    *   A \DOMDocument object to serialize, only the tags below the first <body>
 303    *   node will be converted.
 304    *
 305    * @return string
 306    *   A valid (X)HTML snippet, as a string.
 307    */
 308   public static function serialize(\DOMDocument $document) {
 309     $body_node = $document->getElementsByTagName('body')->item(0);
 310     $html = '';
 311
 312     if ($body_node !== NULL) {
 313       foreach ($body_node->getElementsByTagName('script') as $node) {
 314         static::escapeCdataElement($node);
 315       }
 316       foreach ($body_node->getElementsByTagName('style') as $node) {
 317         static::escapeCdataElement($node, '/*', '*/');
 318       }
 319       foreach ($body_node->childNodes as $node) {
 320         $html .= $document->saveXML($node);
 321       }
 322     }
 323     return $html;
 324   }
 325
 326   /**
 327    * Adds comments around a <!CDATA section in a \DOMNode.
 328    *
 329    * \DOMDocument::loadHTML() in \Drupal\Component\Utility\Html::load() makes
 330    * CDATA sections from the contents of inline script and style tags. This can
 331    * cause HTML4 browsers to throw exceptions.
 332    *
 333    * This function attempts to solve the problem by creating a
 334    * \DOMDocumentFragment to comment the CDATA tag.
 335    *
 336    * @param \DOMNode $node
 337    *   The element potentially containing a CDATA node.
 338    * @param string $comment_start
 339    *   (optional) A string to use as a comment start marker to escape the CDATA
 340    *   declaration. Defaults to '//'.
 341    * @param string $comment_end
 342    *   (optional) A string to use as a comment end marker to escape the CDATA
 343    *   declaration. Defaults to an empty string.
 344    */
 345   public static function escapeCdataElement(\DOMNode $node, $comment_start = '//', $comment_end = '') {
 346     foreach ($node->childNodes as $child_node) {
 347       if ($child_node instanceof \DOMCdataSection) {
 348         $embed_prefix = "\n<!--{$comment_start}--><![CDATA[{$comment_start} ><!--{$comment_end}\n";
 349         $embed_suffix = "\n{$comment_start}--><!]]>{$comment_end}\n";
 350
 351         // Prevent invalid cdata escaping as this would throw a DOM error.
 352         // This is the same behavior as found in libxml2.
 353         // Related W3C standard: http://www.w3.org/TR/REC-xml/#dt-cdsection
 354         // Fix explanation: http://wikipedia.org/wiki/CDATA#Nesting
 355         $data = str_replace(']]>', ']]]]><![CDATA[>', $child_node->data);
 356
 357         $fragment = $node->ownerDocument->createDocumentFragment();
 358         $fragment->appendXML($embed_prefix . $data . $embed_suffix);
 359         $node->appendChild($fragment);
 360         $node->removeChild($child_node);
 361       }
 362     }
 363   }
 364
 365   /**
 366    * Decodes all HTML entities including numerical ones to regular UTF-8 bytes.
 367    *
 368    * Double-escaped entities will only be decoded once ("&amp;lt;" becomes
 369    * "&lt;", not "<"). Be careful when using this function, as it will revert
 370    * previous sanitization efforts (&lt;script&gt; will become <script>).
 371    *
 372    * This method is not the opposite of Html::escape(). For example, this method
 373    * will convert "&eacute;" to "é", whereas Html::escape() will not convert "é"
 374    * to "&eacute;".
 375    *
 376    * @param string $text
 377    *   The text to decode entities in.
 378    *
 379    * @return string
 380    *   The input $text, with all HTML entities decoded once.
 381    *
 382    * @see html_entity_decode()
 383    * @see \Drupal\Component\Utility\Html::escape()
 384    */
 385   public static function decodeEntities($text) {
 386     return html_entity_decode($text, ENT_QUOTES, 'UTF-8');
 387   }
 388
 389   /**
 390    * Escapes text by converting special characters to HTML entities.
 391    *
 392    * This method escapes HTML for sanitization purposes by replacing the
 393    * following special characters with their HTML entity equivalents:
 394    * - & (ampersand) becomes &amp;
 395    * - " (double quote) becomes &quot;
 396    * - ' (single quote) becomes &#039;
 397    * - < (less than) becomes &lt;
 398    * - > (greater than) becomes &gt;
 399    * Special characters that have already been escaped will be double-escaped
 400    * (for example, "&lt;" becomes "&amp;lt;"), and invalid UTF-8 encoding
 401    * will be converted to the Unicode replacement character ("�").
 402    *
 403    * This method is not the opposite of Html::decodeEntities(). For example,
 404    * this method will not encode "é" to "&eacute;", whereas
 405    * Html::decodeEntities() will convert all HTML entities to UTF-8 bytes,
 406    * including "&eacute;" and "&lt;" to "é" and "<".
 407    *
 408    * When constructing @link theme_render render arrays @endlink passing the output of Html::escape() to
 409    * '#markup' is not recommended. Use the '#plain_text' key instead and the
 410    * renderer will autoescape the text.
 411    *
 412    * @param string $text
 413    *   The input text.
 414    *
 415    * @return string
 416    *   The text with all HTML special characters converted.
 417    *
 418    * @see htmlspecialchars()
 419    * @see \Drupal\Component\Utility\Html::decodeEntities()
 420    *
 421    * @ingroup sanitization
 422    */
 423   public static function escape($text) {
 424     return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
 425   }
 426
 427   /**
 428    * Converts all root-relative URLs to absolute URLs.
 429    *
 430    * Does not change any existing protocol-relative or absolute URLs. Does not
 431    * change other relative URLs because they would result in different absolute
 432    * URLs depending on the current path. For example: when the same content
 433    * containing such a relative URL (for example 'image.png'), is served from
 434    * its canonical URL (for example 'http://example.com/some-article') or from
 435    * a listing or feed (for example 'http://example.com/all-articles') their
 436    * "current path" differs, resulting in different absolute URLs:
 437    * 'http://example.com/some-article/image.png' versus
 438    * 'http://example.com/all-articles/image.png'. Only one can be correct.
 439    * Therefore relative URLs that are not root-relative cannot be safely
 440    * transformed and should generally be avoided.
 441    *
 442    * Necessary for HTML that is served outside of a website, for example, RSS
 443    * and e-mail.
 444    *
 445    * @param string $html
 446    *   The partial (X)HTML snippet to load. Invalid markup will be corrected on
 447    *   import.
 448    * @param string $scheme_and_host
 449    *   The root URL, which has a URI scheme, host and optional port.
 450    *
 451    * @return string
 452    *   The updated (X)HTML snippet.
 453    */
 454   public static function transformRootRelativeUrlsToAbsolute($html, $scheme_and_host) {
 455     assert(empty(array_diff(array_keys(parse_url($scheme_and_host)), ["scheme", "host", "port"])), '$scheme_and_host contains scheme, host and port at most.');
 456     assert(isset(parse_url($scheme_and_host)["scheme"]), '$scheme_and_host is absolute and hence has a scheme.');
 457     assert(isset(parse_url($scheme_and_host)["host"]), '$base_url is absolute and hence has a host.');
 458
 459     $html_dom = Html::load($html);
 460     $xpath = new \DOMXpath($html_dom);
 461
 462     // Update all root-relative URLs to absolute URLs in the given HTML.
 463     foreach (static::$uriAttributes as $attr) {
 464       foreach ($xpath->query("//*[starts-with(@$attr, '/') and not(starts-with(@$attr, '//'))]") as $node) {
 465         $node->setAttribute($attr, $scheme_and_host . $node->getAttribute($attr));
 466       }
 467       foreach ($xpath->query("//*[@srcset]") as $node) {
 468         // @see https://html.spec.whatwg.org/multipage/embedded-content.html#attr-img-srcset
 469         // @see https://html.spec.whatwg.org/multipage/embedded-content.html#image-candidate-string
 470         $image_candidate_strings = explode(',', $node->getAttribute('srcset'));
 471         $image_candidate_strings = array_map('trim', $image_candidate_strings);
 472         for ($i = 0; $i < count($image_candidate_strings); $i++) {
 473           $image_candidate_string = $image_candidate_strings[$i];
 474           if ($image_candidate_string[0] === '/' && $image_candidate_string[1] !== '/') {
 475             $image_candidate_strings[$i] = $scheme_and_host . $image_candidate_string;
 476           }
 477         }
 478         $node->setAttribute('srcset', implode(', ', $image_candidate_strings));
 479       }
 480     }
 481     return Html::serialize($html_dom);
 482   }
 483
 484 }