www.aleph1.co.uk Git - yaffs-website/blob - web/core/lib/Drupal/Component/Utility/Xss.php

   1 <?php
   2
   3 namespace Drupal\Component\Utility;
   4
   5 /**
   6  * Provides helper to filter for cross-site scripting.
   7  *
   8  * @ingroup utility
   9  */
  10 class Xss {
  11
  12   /**
  13    * The list of HTML tags allowed by filterAdmin().
  14    *
  15    * @var array
  16    *
  17    * @see \Drupal\Component\Utility\Xss::filterAdmin()
  18    */
  19   protected static $adminTags = ['a', 'abbr', 'acronym', 'address', 'article', 'aside', 'b', 'bdi', 'bdo', 'big', 'blockquote', 'br', 'caption', 'cite', 'code', 'col', 'colgroup', 'command', 'dd', 'del', 'details', 'dfn', 'div', 'dl', 'dt', 'em', 'figcaption', 'figure', 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'i', 'img', 'ins', 'kbd', 'li', 'mark', 'menu', 'meter', 'nav', 'ol', 'output', 'p', 'pre', 'progress', 'q', 'rp', 'rt', 'ruby', 's', 'samp', 'section', 'small', 'span', 'strong', 'sub', 'summary', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'time', 'tr', 'tt', 'u', 'ul', 'var', 'wbr'];
  20
  21   /**
  22    * The default list of HTML tags allowed by filter().
  23    *
  24    * @var array
  25    *
  26    * @see \Drupal\Component\Utility\Xss::filter()
  27    */
  28   protected static $htmlTags = ['a', 'em', 'strong', 'cite', 'blockquote', 'code', 'ul', 'ol', 'li', 'dl', 'dt', 'dd'];
  29
  30   /**
  31    * Filters HTML to prevent cross-site-scripting (XSS) vulnerabilities.
  32    *
  33    * Based on kses by Ulf Harnhammar, see http://sourceforge.net/projects/kses.
  34    * For examples of various XSS attacks, see: http://ha.ckers.org/xss.html.
  35    *
  36    * This code does four things:
  37    * - Removes characters and constructs that can trick browsers.
  38    * - Makes sure all HTML entities are well-formed.
  39    * - Makes sure all HTML tags and attributes are well-formed.
  40    * - Makes sure no HTML tags contain URLs with a disallowed protocol (e.g.
  41    *   javascript:).
  42    *
  43    * @param $string
  44    *   The string with raw HTML in it. It will be stripped of everything that
  45    *   can cause an XSS attack.
  46    * @param array $html_tags
  47    *   An array of HTML tags.
  48    *
  49    * @return string
  50    *   An XSS safe version of $string, or an empty string if $string is not
  51    *   valid UTF-8.
  52    *
  53    * @see \Drupal\Component\Utility\Unicode::validateUtf8()
  54    *
  55    * @ingroup sanitization
  56    */
  57   public static function filter($string, array $html_tags = NULL) {
  58     if (is_null($html_tags)) {
  59       $html_tags = static::$htmlTags;
  60     }
  61     // Only operate on valid UTF-8 strings. This is necessary to prevent cross
  62     // site scripting issues on Internet Explorer 6.
  63     if (!Unicode::validateUtf8($string)) {
  64       return '';
  65     }
  66     // Remove NULL characters (ignored by some browsers).
  67     $string = str_replace(chr(0), '', $string);
  68     // Remove Netscape 4 JS entities.
  69     $string = preg_replace('%&\s*\{[^}]*(\}\s*;?|$)%', '', $string);
  70
  71     // Defuse all HTML entities.
  72     $string = str_replace('&', '&amp;', $string);
  73     // Change back only well-formed entities in our whitelist:
  74     // Decimal numeric entities.
  75     $string = preg_replace('/&amp;#([0-9]+;)/', '&#\1', $string);
  76     // Hexadecimal numeric entities.
  77     $string = preg_replace('/&amp;#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/', '&#x\1', $string);
  78     // Named entities.
  79     $string = preg_replace('/&amp;([A-Za-z][A-Za-z0-9]*;)/', '&\1', $string);
  80     $html_tags = array_flip($html_tags);
  81     // Late static binding does not work inside anonymous functions.
  82     $class = get_called_class();
  83     $splitter = function ($matches) use ($html_tags, $class) {
  84       return $class::split($matches[1], $html_tags, $class);
  85     };
  86     // Strip any tags that are not in the whitelist.
  87     return preg_replace_callback('%
  88       (
  89       <(?=[^a-zA-Z!/])  # a lone <
  90       |                 # or
  91       <!--.*?-->        # a comment
  92       |                 # or
  93       <[^>]*(>|$)       # a string that starts with a <, up until the > or the end of the string
  94       |                 # or
  95       >                 # just a >
  96       )%x', $splitter, $string);
  97   }
  98
  99   /**
 100    * Applies a very permissive XSS/HTML filter for admin-only use.
 101    *
 102    * Use only for fields where it is impractical to use the
 103    * whole filter system, but where some (mainly inline) mark-up
 104    * is desired (so \Drupal\Component\Utility\Html::escape() is
 105    * not acceptable).
 106    *
 107    * Allows all tags that can be used inside an HTML body, save
 108    * for scripts and styles.
 109    *
 110    * @param string $string
 111    *   The string to apply the filter to.
 112    *
 113    * @return string
 114    *   The filtered string.
 115    *
 116    * @ingroup sanitization
 117    *
 118    * @see \Drupal\Component\Utility\Xss::getAdminTagList()
 119    */
 120   public static function filterAdmin($string) {
 121     return static::filter($string, static::$adminTags);
 122   }
 123
 124   /**
 125    * Processes an HTML tag.
 126    *
 127    * @param string $string
 128    *   The HTML tag to process.
 129    * @param array $html_tags
 130    *   An array where the keys are the allowed tags and the values are not
 131    *   used.
 132    * @param string $class
 133    *   The called class. This method is called from an anonymous function which
 134    *   breaks late static binding. See https://bugs.php.net/bug.php?id=66622 for
 135    *   more information.
 136    *
 137    * @return string
 138    *   If the element isn't allowed, an empty string. Otherwise, the cleaned up
 139    *   version of the HTML element.
 140    */
 141   protected static function split($string, $html_tags, $class) {
 142     if (substr($string, 0, 1) != '<') {
 143       // We matched a lone ">" character.
 144       return '&gt;';
 145     }
 146     elseif (strlen($string) == 1) {
 147       // We matched a lone "<" character.
 148       return '&lt;';
 149     }
 150
 151     if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9\-]+)\s*([^>]*)>?|(<!--.*?-->)$%', $string, $matches)) {
 152       // Seriously malformed.
 153       return '';
 154     }
 155     $slash = trim($matches[1]);
 156     $elem = &$matches[2];
 157     $attrlist = &$matches[3];
 158     $comment = &$matches[4];
 159
 160     if ($comment) {
 161       $elem = '!--';
 162     }
 163
 164     // When in whitelist mode, an element is disallowed when not listed.
 165     if ($class::needsRemoval($html_tags, $elem)) {
 166       return '';
 167     }
 168
 169     if ($comment) {
 170       return $comment;
 171     }
 172
 173     if ($slash != '') {
 174       return "</$elem>";
 175     }
 176
 177     // Is there a closing XHTML slash at the end of the attributes?
 178     $attrlist = preg_replace('%(\s?)/\s*$%', '\1', $attrlist, -1, $count);
 179     $xhtml_slash = $count ? ' /' : '';
 180
 181     // Clean up attributes.
 182     $attr2 = implode(' ', $class::attributes($attrlist));
 183     $attr2 = preg_replace('/[<>]/', '', $attr2);
 184     $attr2 = strlen($attr2) ? ' ' . $attr2 : '';
 185
 186     return "<$elem$attr2$xhtml_slash>";
 187   }
 188
 189   /**
 190    * Processes a string of HTML attributes.
 191    *
 192    * @param string $attributes
 193    *   The html attribute to process.
 194    *
 195    * @return string
 196    *   Cleaned up version of the HTML attributes.
 197    */
 198   protected static function attributes($attributes) {
 199     $attributes_array = [];
 200     $mode = 0;
 201     $attribute_name = '';
 202     $skip = FALSE;
 203     $skip_protocol_filtering = FALSE;
 204
 205     while (strlen($attributes) != 0) {
 206       // Was the last operation successful?
 207       $working = 0;
 208
 209       switch ($mode) {
 210         case 0:
 211           // Attribute name, href for instance.
 212           if (preg_match('/^([-a-zA-Z][-a-zA-Z0-9]*)/', $attributes, $match)) {
 213             $attribute_name = strtolower($match[1]);
 214             $skip = ($attribute_name == 'style' || substr($attribute_name, 0, 2) == 'on');
 215
 216             // Values for attributes of type URI should be filtered for
 217             // potentially malicious protocols (for example, an href-attribute
 218             // starting with "javascript:"). However, for some non-URI
 219             // attributes performing this filtering causes valid and safe data
 220             // to be mangled. We prevent this by skipping protocol filtering on
 221             // such attributes.
 222             // @see \Drupal\Component\Utility\UrlHelper::filterBadProtocol()
 223             // @see http://www.w3.org/TR/html4/index/attributes.html
 224             $skip_protocol_filtering = substr($attribute_name, 0, 5) === 'data-' || in_array($attribute_name, [
 225               'title',
 226               'alt',
 227               'rel',
 228               'property',
 229             ]);
 230
 231             $working = $mode = 1;
 232             $attributes = preg_replace('/^[-a-zA-Z][-a-zA-Z0-9]*/', '', $attributes);
 233           }
 234           break;
 235
 236         case 1:
 237           // Equals sign or valueless ("selected").
 238           if (preg_match('/^\s*=\s*/', $attributes)) {
 239             $working = 1; $mode = 2;
 240             $attributes = preg_replace('/^\s*=\s*/', '', $attributes);
 241             break;
 242           }
 243
 244           if (preg_match('/^\s+/', $attributes)) {
 245             $working = 1; $mode = 0;
 246             if (!$skip) {
 247               $attributes_array[] = $attribute_name;
 248             }
 249             $attributes = preg_replace('/^\s+/', '', $attributes);
 250           }
 251           break;
 252
 253         case 2:
 254           // Attribute value, a URL after href= for instance.
 255           if (preg_match('/^"([^"]*)"(\s+|$)/', $attributes, $match)) {
 256             $thisval = $skip_protocol_filtering ? $match[1] : UrlHelper::filterBadProtocol($match[1]);
 257
 258             if (!$skip) {
 259               $attributes_array[] = "$attribute_name=\"$thisval\"";
 260             }
 261             $working = 1;
 262             $mode = 0;
 263             $attributes = preg_replace('/^"[^"]*"(\s+|$)/', '', $attributes);
 264             break;
 265           }
 266
 267           if (preg_match("/^'([^']*)'(\s+|$)/", $attributes, $match)) {
 268             $thisval = $skip_protocol_filtering ? $match[1] : UrlHelper::filterBadProtocol($match[1]);
 269
 270             if (!$skip) {
 271               $attributes_array[] = "$attribute_name='$thisval'";
 272             }
 273             $working = 1; $mode = 0;
 274             $attributes = preg_replace("/^'[^']*'(\s+|$)/", '', $attributes);
 275             break;
 276           }
 277
 278           if (preg_match("%^([^\s\"']+)(\s+|$)%", $attributes, $match)) {
 279             $thisval = $skip_protocol_filtering ? $match[1] : UrlHelper::filterBadProtocol($match[1]);
 280
 281             if (!$skip) {
 282               $attributes_array[] = "$attribute_name=\"$thisval\"";
 283             }
 284             $working = 1; $mode = 0;
 285             $attributes = preg_replace("%^[^\s\"']+(\s+|$)%", '', $attributes);
 286           }
 287           break;
 288       }
 289
 290       if ($working == 0) {
 291         // Not well formed; remove and try again.
 292         $attributes = preg_replace('/
 293           ^
 294           (
 295           "[^"]*("|$)     # - a string that starts with a double quote, up until the next double quote or the end of the string
 296           |               # or
 297           \'[^\']*(\'|$)| # - a string that starts with a quote, up until the next quote or the end of the string
 298           |               # or
 299           \S              # - a non-whitespace character
 300           )*              # any number of the above three
 301           \s*             # any number of whitespaces
 302           /x', '', $attributes);
 303         $mode = 0;
 304       }
 305     }
 306
 307     // The attribute list ends with a valueless attribute like "selected".
 308     if ($mode == 1 && !$skip) {
 309       $attributes_array[] = $attribute_name;
 310     }
 311     return $attributes_array;
 312   }
 313
 314   /**
 315    * Whether this element needs to be removed altogether.
 316    *
 317    * @param $html_tags
 318    *   The list of HTML tags.
 319    * @param $elem
 320    *   The name of the HTML element.
 321    *
 322    * @return bool
 323    *   TRUE if this element needs to be removed.
 324    */
 325   protected static function needsRemoval($html_tags, $elem) {
 326     return !isset($html_tags[strtolower($elem)]);
 327   }
 328
 329   /**
 330    * Gets the list of HTML tags allowed by Xss::filterAdmin().
 331    *
 332    * @return array
 333    *   The list of HTML tags allowed by filterAdmin().
 334    */
 335   public static function getAdminTagList() {
 336     return static::$adminTags;
 337   }
 338
 339   /**
 340    * Gets the standard list of HTML tags allowed by Xss::filter().
 341    *
 342    * @return array
 343    *   The list of HTML tags allowed by Xss::filter().
 344    */
 345   public static function getHtmlTagList() {
 346     return static::$htmlTags;
 347   }
 348
 349 }