MediaWiki  master
Sanitizer.php
Go to the documentation of this file.
00001 <?php
00031 class Sanitizer {
/**
 * Pattern matching the character-reference forms handled by this class.
 * Written for PCRE extended mode ("x"), so the line breaks and indentation
 * inside the literal are ignored and "#" has to be escaped.
 * Capture groups, one per alternative:
 *   1 - name of a named reference "&name;" (ASCII letters, digits, or bytes 0x80-0xff)
 *   2 - digits of a decimal numeric reference "&#123;"
 *   3 - digits of a hexadecimal numeric reference "&#x7b;" / "&#X7B;"
 *   4 - a bare "&" that starts none of the above
 */
00036         const CHAR_REFS_REGEX =
00037                 '/&([A-Za-z0-9\x80-\xff]+);
00038                  |&\#([0-9]+);
00039                  |&\#[xX]([0-9A-Fa-f]+);
00040                  |(&)/x';
00041 
/**
 * Case-insensitive pattern for script-scheme keywords in a value: "javascript"
 * or "vbscript" when preceded by the start of the string, whitespace, or a
 * block-comment terminator (asterisk + slash, optionally followed by
 * whitespace) and followed by a non-word character or the end of the string.
 * validateAttributes() rejects xmlns:* attribute values that match it.
 */
00050         const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
/**
 * Pattern for XML namespace declaration attribute names: the literal prefix
 * "xmlns:" followed by one or more of ":", "_", "-", ".", ASCII letters or
 * digits. Case-sensitive. validateAttributes() tests attribute names against
 * it when $wgAllowRdfaAttributes is set.
 */
00051         const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
00052 
/**
 * Table of named character entities: entity name (without the leading "&" and
 * trailing ";") => integer value. The values are the code points of the
 * characters the names stand for (e.g. 'amp' => 38, 'lt' => 60, 'nbsp' => 160).
 * Names are case-sensitive ('dagger' and 'Dagger' are different entries).
 * NOTE(review): presumably consulted when decoding or normalizing the named
 * references matched by CHAR_REFS_REGEX; the consumer is outside this excerpt.
 */
00059         static $htmlEntities = array(
00060                 'Aacute'   => 193,
00061                 'aacute'   => 225,
00062                 'Acirc'    => 194,
00063                 'acirc'    => 226,
00064                 'acute'    => 180,
00065                 'AElig'    => 198,
00066                 'aelig'    => 230,
00067                 'Agrave'   => 192,
00068                 'agrave'   => 224,
00069                 'alefsym'  => 8501,
00070                 'Alpha'    => 913,
00071                 'alpha'    => 945,
00072                 'amp'      => 38,
00073                 'and'      => 8743,
00074                 'ang'      => 8736,
00075                 'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
00076                 'Aring'    => 197,
00077                 'aring'    => 229,
00078                 'asymp'    => 8776,
00079                 'Atilde'   => 195,
00080                 'atilde'   => 227,
00081                 'Auml'     => 196,
00082                 'auml'     => 228,
00083                 'bdquo'    => 8222,
00084                 'Beta'     => 914,
00085                 'beta'     => 946,
00086                 'brvbar'   => 166,
00087                 'bull'     => 8226,
00088                 'cap'      => 8745,
00089                 'Ccedil'   => 199,
00090                 'ccedil'   => 231,
00091                 'cedil'    => 184,
00092                 'cent'     => 162,
00093                 'Chi'      => 935,
00094                 'chi'      => 967,
00095                 'circ'     => 710,
00096                 'clubs'    => 9827,
00097                 'cong'     => 8773,
00098                 'copy'     => 169,
00099                 'crarr'    => 8629,
00100                 'cup'      => 8746,
00101                 'curren'   => 164,
00102                 'dagger'   => 8224,
00103                 'Dagger'   => 8225,
00104                 'darr'     => 8595,
00105                 'dArr'     => 8659,
00106                 'deg'      => 176,
00107                 'Delta'    => 916,
00108                 'delta'    => 948,
00109                 'diams'    => 9830,
00110                 'divide'   => 247,
00111                 'Eacute'   => 201,
00112                 'eacute'   => 233,
00113                 'Ecirc'    => 202,
00114                 'ecirc'    => 234,
00115                 'Egrave'   => 200,
00116                 'egrave'   => 232,
00117                 'empty'    => 8709,
00118                 'emsp'     => 8195,
00119                 'ensp'     => 8194,
00120                 'Epsilon'  => 917,
00121                 'epsilon'  => 949,
00122                 'equiv'    => 8801,
00123                 'Eta'      => 919,
00124                 'eta'      => 951,
00125                 'ETH'      => 208,
00126                 'eth'      => 240,
00127                 'Euml'     => 203,
00128                 'euml'     => 235,
00129                 'euro'     => 8364,
00130                 'exist'    => 8707,
00131                 'fnof'     => 402,
00132                 'forall'   => 8704,
00133                 'frac12'   => 189,
00134                 'frac14'   => 188,
00135                 'frac34'   => 190,
00136                 'frasl'    => 8260,
00137                 'Gamma'    => 915,
00138                 'gamma'    => 947,
00139                 'ge'       => 8805,
00140                 'gt'       => 62,
00141                 'harr'     => 8596,
00142                 'hArr'     => 8660,
00143                 'hearts'   => 9829,
00144                 'hellip'   => 8230,
00145                 'Iacute'   => 205,
00146                 'iacute'   => 237,
00147                 'Icirc'    => 206,
00148                 'icirc'    => 238,
00149                 'iexcl'    => 161,
00150                 'Igrave'   => 204,
00151                 'igrave'   => 236,
00152                 'image'    => 8465,
00153                 'infin'    => 8734,
00154                 'int'      => 8747,
00155                 'Iota'     => 921,
00156                 'iota'     => 953,
00157                 'iquest'   => 191,
00158                 'isin'     => 8712,
00159                 'Iuml'     => 207,
00160                 'iuml'     => 239,
00161                 'Kappa'    => 922,
00162                 'kappa'    => 954,
00163                 'Lambda'   => 923,
00164                 'lambda'   => 955,
00165                 'lang'     => 9001,
00166                 'laquo'    => 171,
00167                 'larr'     => 8592,
00168                 'lArr'     => 8656,
00169                 'lceil'    => 8968,
00170                 'ldquo'    => 8220,
00171                 'le'       => 8804,
00172                 'lfloor'   => 8970,
00173                 'lowast'   => 8727,
00174                 'loz'      => 9674,
00175                 'lrm'      => 8206,
00176                 'lsaquo'   => 8249,
00177                 'lsquo'    => 8216,
00178                 'lt'       => 60,
00179                 'macr'     => 175,
00180                 'mdash'    => 8212,
00181                 'micro'    => 181,
00182                 'middot'   => 183,
00183                 'minus'    => 8722,
00184                 'Mu'       => 924,
00185                 'mu'       => 956,
00186                 'nabla'    => 8711,
00187                 'nbsp'     => 160,
00188                 'ndash'    => 8211,
00189                 'ne'       => 8800,
00190                 'ni'       => 8715,
00191                 'not'      => 172,
00192                 'notin'    => 8713,
00193                 'nsub'     => 8836,
00194                 'Ntilde'   => 209,
00195                 'ntilde'   => 241,
00196                 'Nu'       => 925,
00197                 'nu'       => 957,
00198                 'Oacute'   => 211,
00199                 'oacute'   => 243,
00200                 'Ocirc'    => 212,
00201                 'ocirc'    => 244,
00202                 'OElig'    => 338,
00203                 'oelig'    => 339,
00204                 'Ograve'   => 210,
00205                 'ograve'   => 242,
00206                 'oline'    => 8254,
00207                 'Omega'    => 937,
00208                 'omega'    => 969,
00209                 'Omicron'  => 927,
00210                 'omicron'  => 959,
00211                 'oplus'    => 8853,
00212                 'or'       => 8744,
00213                 'ordf'     => 170,
00214                 'ordm'     => 186,
00215                 'Oslash'   => 216,
00216                 'oslash'   => 248,
00217                 'Otilde'   => 213,
00218                 'otilde'   => 245,
00219                 'otimes'   => 8855,
00220                 'Ouml'     => 214,
00221                 'ouml'     => 246,
00222                 'para'     => 182,
00223                 'part'     => 8706,
00224                 'permil'   => 8240,
00225                 'perp'     => 8869,
00226                 'Phi'      => 934,
00227                 'phi'      => 966,
00228                 'Pi'       => 928,
00229                 'pi'       => 960,
00230                 'piv'      => 982,
00231                 'plusmn'   => 177,
00232                 'pound'    => 163,
00233                 'prime'    => 8242,
00234                 'Prime'    => 8243,
00235                 'prod'     => 8719,
00236                 'prop'     => 8733,
00237                 'Psi'      => 936,
00238                 'psi'      => 968,
00239                 'quot'     => 34,
00240                 'radic'    => 8730,
00241                 'rang'     => 9002,
00242                 'raquo'    => 187,
00243                 'rarr'     => 8594,
00244                 'rArr'     => 8658,
00245                 'rceil'    => 8969,
00246                 'rdquo'    => 8221,
00247                 'real'     => 8476,
00248                 'reg'      => 174,
00249                 'rfloor'   => 8971,
00250                 'Rho'      => 929,
00251                 'rho'      => 961,
00252                 'rlm'      => 8207,
00253                 'rsaquo'   => 8250,
00254                 'rsquo'    => 8217,
00255                 'sbquo'    => 8218,
00256                 'Scaron'   => 352,
00257                 'scaron'   => 353,
00258                 'sdot'     => 8901,
00259                 'sect'     => 167,
00260                 'shy'      => 173,
00261                 'Sigma'    => 931,
00262                 'sigma'    => 963,
00263                 'sigmaf'   => 962,
00264                 'sim'      => 8764,
00265                 'spades'   => 9824,
00266                 'sub'      => 8834,
00267                 'sube'     => 8838,
00268                 'sum'      => 8721,
00269                 'sup'      => 8835,
00270                 'sup1'     => 185,
00271                 'sup2'     => 178,
00272                 'sup3'     => 179,
00273                 'supe'     => 8839,
00274                 'szlig'    => 223,
00275                 'Tau'      => 932,
00276                 'tau'      => 964,
00277                 'there4'   => 8756,
00278                 'Theta'    => 920,
00279                 'theta'    => 952,
00280                 'thetasym' => 977,
00281                 'thinsp'   => 8201,
00282                 'THORN'    => 222,
00283                 'thorn'    => 254,
00284                 'tilde'    => 732,
00285                 'times'    => 215,
00286                 'trade'    => 8482,
00287                 'Uacute'   => 218,
00288                 'uacute'   => 250,
00289                 'uarr'     => 8593,
00290                 'uArr'     => 8657,
00291                 'Ucirc'    => 219,
00292                 'ucirc'    => 251,
00293                 'Ugrave'   => 217,
00294                 'ugrave'   => 249,
00295                 'uml'      => 168,
00296                 'upsih'    => 978,
00297                 'Upsilon'  => 933,
00298                 'upsilon'  => 965,
00299                 'Uuml'     => 220,
00300                 'uuml'     => 252,
00301                 'weierp'   => 8472,
00302                 'Xi'       => 926,
00303                 'xi'       => 958,
00304                 'Yacute'   => 221,
00305                 'yacute'   => 253,
00306                 'yen'      => 165,
00307                 'Yuml'     => 376,
00308                 'yuml'     => 255,
00309                 'Zeta'     => 918,
00310                 'zeta'     => 950,
00311                 'zwj'      => 8205,
00312                 'zwnj'     => 8204
00313         );
00314 
/**
 * Alternative entity names => the $htmlEntities key they stand for.
 * Both entries are non-ASCII spellings (Hebrew and Arabic script) that map to
 * 'rlm'; CHAR_REFS_REGEX accepts such names because its name class includes
 * the bytes 0x80-0xff.
 * NOTE(review): the code that resolves aliases is outside this excerpt.
 */
00318         static $htmlEntityAliases = array(
00319                 'רלמ' => 'rlm',
00320                 'رلم' => 'rlm',
00321         );
00322 
/**
 * Cache slot for the pattern built by getAttribsRegex(); stays null until
 * that method is called for the first time.
 */
00326         static $attribsRegex;
00327 
/**
 * Return the regular expression that matches a single attribute, optionally
 * with a value, inside a tag's parameter string. The pattern is assembled on
 * the first call and cached in self::$attribsRegex for later calls.
 *
 * Capture groups of the returned pattern:
 *   1 - attribute name
 *   2 - the optional "= value" part including surrounding whitespace
 *   3 - value given in double quotes (without the quotes)
 *   4 - value given in single quotes (without the quotes)
 *   5 - unquoted value
 *   6 - unquoted "#" followed by hex digits
 *
 * @return string PCRE pattern using the "s" and "x" modifiers
 */
00333         static function getAttribsRegex() {
                      # Only build the pattern once; afterwards the cached string is returned.
00334                 if ( self::$attribsRegex === null ) {
                              # Characters accepted as the first character of an attribute name.
00335                         $attribFirst = '[:A-Z_a-z0-9]';
                              # Characters accepted after the first one (adds "-" and ".").
00336                         $attrib = '[:A-Z_a-z-.0-9]';
                              # Tab, line feed, carriage return and space.
00337                         $space = '[\x09\x0a\x0d\x20]';
                              # An attribute must start at the beginning of the string or after
                              # whitespace, and must be followed by whitespace or the end of the
                              # string (lookahead, so the separator is not consumed).
00338                         self::$attribsRegex =
00339                                 "/(?:^|$space)({$attribFirst}{$attrib}*)
00340                                   ($space*=$space*
00341                                         (?:
00342                                          # The attribute value: quoted or alone
00343                                           \"([^<\"]*)\"
00344                                          | '([^<']*)'
00345                                          |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
00346                                          |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
00347                                                                                  # colors are specified like this.
00348                                                                                  # We'll be normalizing it.
00349                                         )
00350                                 )?(?=$space|\$)/sx";
00351                 }
00352                 return self::$attribsRegex;
00353         }
00354 
/**
 * Clean up HTML: strip HTML comments, escape every tag that is not in the
 * allowed set (or that is badly nested / invalid), and strip non-approved
 * attributes from the tags that are kept.
 *
 * When $wgUseTidy is off this method also tracks open elements so that
 * mismatched close tags are escaped and elements still open at the end of the
 * text are closed. When $wgUseTidy is on, nesting is left alone and only the
 * tag/attribute filtering is applied.
 *
 * @param string $text Input text
 * @param callable|null $processCallback If callable, invoked as
 *   callback( &$params, $args ) for every allowed opening tag so it can
 *   rewrite the tag's raw attribute string before validation
 * @param array $args Passed through to $processCallback
 * @param array $extratags Additional tag names to allow (treated as tags
 *   that must be closed)
 * @param array $removetags Tag names to remove from the allowed set
 * @return string Sanitized text
 */
static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
	global $wgUseTidy, $wgHtml5, $wgAllowMicrodataAttributes, $wgAllowImageTag;

	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

	wfProfileIn( __METHOD__ );

	// Base our staticInitialised variable off of the global config state so that if the globals
	// are changed (like in the screwed up test system) we will re-initialise the settings.
	$globalContext = implode( '-', compact( 'wgHtml5', 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
	if ( !$staticInitialised || $staticInitialised !== $globalContext ) {

		$htmlpairsStatic = array( # Tags that must be closed
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp'
		);
		if ( $wgHtml5 ) {
			$htmlpairsStatic = array_merge( $htmlpairsStatic, array( 'data', 'time', 'mark' ) );
		}
		$htmlsingle = array(
			'br', 'hr', 'li', 'dt', 'dd'
		);
		$htmlsingleonly = array( # Elements that cannot have close tags
			'br', 'hr'
		);
		if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
			$htmlsingle[] = $htmlsingleonly[] = 'meta';
			$htmlsingle[] = $htmlsingleonly[] = 'link';
		}
		$htmlnest = array( # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span'
		);
		$tabletags = array( # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		);
		$htmllist = array( # Tags used by list
			'ul','ol',
		);
		$listtags = array( # Tags that can appear in a list
			'li',
		);

		if ( $wgAllowImageTag ) {
			$htmlsingle[] = 'img';
			$htmlsingleonly[] = 'img';
		}

		$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
		$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

		# Convert them all to hashtables for faster lookup
		$vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
			'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
		foreach ( $vars as $var ) {
			$$var = array_flip( $$var );
		}
		$staticInitialised = $globalContext;
	}
	# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
	$extratags = array_flip( $extratags );
	$removetags = array_flip( $removetags );
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

	# Splits one '<'-delimited chunk into:
	#   1 ($slash):  '/' if the element is a close tag, '' otherwise
	#   2 ($t):      element name
	#   3 ($params): string between element name and >
	#   4 ($brace):  ending '>' or '/>'
	#   5 ($rest):   everything until the next element of $bits
	$tagPattern = '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!';

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );
	$bits = explode( '<', $text );
	$text = str_replace( '>', '&gt;', array_shift( $bits ) );
	if ( !$wgUseTidy ) {
		# $tagstack holds the open elements of the current table context;
		# $tablestack holds the saved $tagstack of every enclosing context.
		$tagstack = $tablestack = array();
		foreach ( $bits as $x ) {
			$regs = array();
			if ( !preg_match( $tagPattern, $x, $regs ) ) {
				# Not a tag at all (e.g. "a < b"): emit it as escaped text
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
				continue;
			}
			list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
			$t = strtolower( $t );

			$badtag = false;
			if ( isset( $htmlelements[$t] ) ) {
				# Check our stack
				if ( $slash && isset( $htmlsingleonly[$t] ) ) {
					$badtag = true;
				} elseif ( $slash ) {
					# Closing a tag... is it the one we just opened?
					# (array_pop() yields null when the stack is empty)
					$ot = array_pop( $tagstack );
					if ( $ot === $t ) {
						if ( $t === 'table' ) {
							# Leaving a table: go back to the enclosing context
							$tagstack = $tablestack ? array_pop( $tablestack ) : array();
						}
					} elseif ( $ot !== null && isset( $htmlsingleallowed[$ot] ) ) {
						# Pop all elements with an optional close tag
						# and see if we find a match below them
						$optstack = array( $ot );
						$ot = array_pop( $tagstack );
						while ( $ot !== null && $ot !== $t && isset( $htmlsingleallowed[$ot] ) ) {
							$optstack[] = $ot;
							$ot = array_pop( $tagstack );
						}
						if ( $ot === $t ) {
							# Match found below the optional elements. A table
							# closed this way must restore the enclosing context
							# just like a directly matched </table>.
							if ( $t === 'table' ) {
								$tagstack = $tablestack ? array_pop( $tablestack ) : array();
							}
						} else {
							# No match. Put back the element that stopped the
							# search, then the optional elements in their
							# original order, so the stack is left untouched.
							$badtag = true;
							if ( $ot !== null ) {
								$tagstack[] = $ot;
							}
							while ( $optstack ) {
								$tagstack[] = array_pop( $optstack );
							}
						}
					} else {
						# Mismatch: leave the stack as it was (nothing to put
						# back when it was already empty)
						if ( $ot !== null ) {
							$tagstack[] = $ot;
						}
						# <li> can be nested in <ul> or <ol>, skip those cases:
						if ( $ot === null || !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
							$badtag = true;
						}
					}
					$newparams = '';
				} else {
					# Keep track for later
					if ( isset( $tabletags[$t] ) &&
					!in_array( 'table', $tagstack, true ) ) {
						$badtag = true;
					} elseif ( in_array( $t, $tagstack, true ) &&
					!isset( $htmlnest[$t] ) ) {
						$badtag = true;
					# Is it a self closed htmlpair ? (bug 5487)
					} elseif ( $brace === '/>' &&
					isset( $htmlpairs[$t] ) ) {
						$badtag = true;
					} elseif ( isset( $htmlsingleonly[$t] ) ) {
						# Hack to force empty tag for uncloseable elements
						$brace = '/>';
					} elseif ( isset( $htmlsingle[$t] ) ) {
						# Hack to not close $htmlsingle tags
						$brace = null;
						# Still need to push this optionally-closed tag to
						# the tag stack so that we can match end tags
						# instead of marking them as bad.
						$tagstack[] = $t;
					} elseif ( isset( $tabletags[$t] )
					&& in_array( $t, $tagstack, true ) ) {
						// New table tag but forgot to close the previous one
						$text .= "</$t>";
					} else {
						if ( $t === 'table' ) {
							# Entering a table: park the current context and
							# start a fresh one whose first element is the table
							$tablestack[] = $tagstack;
							$tagstack = array();
						}
						$tagstack[] = $t;
					}

					# Replace any variables or template parameters with
					# plaintext results.
					if ( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					if ( !Sanitizer::validateTag( $params, $t ) ) {
						$badtag = true;
					}

					# Strip non-approved attributes from the tag
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
				}
				if ( !$badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$close = ( $brace === '/>' && !$slash ) ? ' /' : '';
					$text .= "<$slash$t$newparams$close>$rest";
					continue;
				}
			}
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) {
				# null once no enclosing context is left, which ends the loop
				$tagstack = array_pop( $tablestack );
			}
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			$regs = array();
			if ( preg_match( $tagPattern, $x, $regs ) ) {
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
				$t = strtolower( $t );
				if ( isset( $htmlelements[$t] ) ) {
					if ( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					$badtag = !Sanitizer::validateTag( $params, $t );

					$newparams = Sanitizer::fixTagAttributes( $params, $t );
					if ( !$badtag ) {
						$rest = str_replace( '>', '&gt;', $rest );
						$text .= "<$slash$t$newparams$brace$rest";
						continue;
					}
				}
			}
			# Not a tag, not an allowed tag, or an invalid one: escape it
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
00594 
/**
 * Strip every terminated HTML comment ("<!--" ... "-->") from $text.
 *
 * A comment that, together with the spaces around it, is preceded and
 * followed by a newline is removed along with those spaces and one of the
 * two newlines. Any other comment is cut out on its own. An unterminated
 * comment, and everything after it, is left untouched.
 *
 * The search restarts from the beginning of the text after each removal, so
 * a comment marker that only comes into existence when the text around a
 * removed comment is joined together is handled as well.
 *
 * @param string $text
 * @return string
 */
static function removeHTMLcomments( $text ) {
	wfProfileIn( __METHOD__ );
	for ( ;; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		# Step over the "-->" itself
		$close += 3;

		# Widen the span over the spaces on both sides of the comment,
		# starting one character before it
		$from = max( $open - 1, 0 );
		$span = $close - $from;
		while ( substr( $text, $from, 1 ) === ' ' && $from > 0 ) {
			$from--;
			$span++;
		}
		while ( substr( $text, $from + $span, 1 ) === ' ' ) {
			$span++;
		}

		$newlineBefore = substr( $text, $from, 1 ) === "\n";
		$newlineAfter = substr( $text, $from + $span, 1 ) === "\n";
		if ( $newlineBefore && $newlineAfter ) {
			# The comment sits alone on its line(s): drop it together with
			# the surrounding spaces and collapse the two newlines into one
			$text = substr_replace( $text, "\n", $from, $span + 1 );
		} else {
			# Drop just the comment
			$text = substr_replace( $text, '', $open, $close - $open );
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
00639 
/**
 * Check that a tag carries the attributes it needs to be acceptable.
 *
 * Only <meta> and <link> have requirements: both need an itemprop
 * attribute, <meta> additionally needs content and <link> additionally
 * needs href. Every other element passes.
 *
 * @param string $params Raw attribute string of the tag
 * @param string $element Element name
 * @return bool False if the tag must be rejected
 */
static function validateTag( $params, $element ) {
	$attribs = Sanitizer::decodeTagAttributes( $params );

	$isMeta = ( $element == 'meta' );
	$isLink = ( $element == 'link' );
	if ( !$isMeta && !$isLink ) {
		return true;
	}

	// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
	if ( !isset( $attribs['itemprop'] ) ) {
		return false;
	}
	// <meta> must have a content="" for the itemprop
	if ( $isMeta && !isset( $attribs['content'] ) ) {
		return false;
	}
	// <link> must have an associated href=""
	if ( $isLink && !isset( $attribs['href'] ) ) {
		return false;
	}

	return true;
}
00671 
/**
 * Filter an attribute array against the whitelist of the given element.
 *
 * Looks up the element's whitelist with attributeWhitelist() and hands
 * both to validateAttributes().
 *
 * @param $attribs Array: attribute name => value
 * @param $element String: element name
 * @return Array: whatever validateAttributes() returns
 */
static function validateTagAttributes( $attribs, $element ) {
	$allowed = Sanitizer::attributeWhitelist( $element );
	return Sanitizer::validateAttributes( $attribs, $allowed );
}
00691 
/**
 * Filter an attribute array against a whitelist and sanity-check values.
 *
 * Processing per attribute, in order:
 * - xmlns:* declarations are kept when $wgAllowRdfaAttributes is set and
 *   the value does not match EVIL_URI_PATTERN; they bypass the whitelist.
 * - Anything not whitelisted is dropped, except data-* names when
 *   $wgHtml5 is set.
 * - style values are passed through checkCss().
 * - id values are passed through escapeId() with the 'noninitial' option.
 * - RDFa / microdata attributes (and rel/rev) are dropped when their
 *   value matches EVIL_URI_PATTERN.
 * - href/src are dropped unless the value starts with a protocol from
 *   wfUrlProtocols() and contains no whitespace.
 *
 * Afterwards, if $wgAllowMicrodataAttributes is set and no itemscope
 * survived, itemtype, itemid and itemref are removed as well.
 *
 * @param $attribs Array: attribute name => value
 * @param $whitelist Array: list of allowed attribute names
 * @return Array: the surviving attributes, name => (possibly rewritten) value
 */
static function validateAttributes( $attribs, $whitelist ) {
	global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5;

	$allowed = array_flip( $whitelist );
	$urlPattern = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	// Attributes whose values may hold URLs, URIs and/or CURIs
	$uriValued = array(
		'rel', 'rev',
		'about', 'property', 'resource', 'datatype', 'typeof', # RDFa
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype', # HTML5 microdata
	);

	$kept = array();
	foreach ( $attribs as $attribute => $value ) {
		// XML namespace declarations are allowed if RDFa is enabled
		if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) {
				$kept[$attribute] = $value;
			}
			continue;
		}

		// In HTML5 mode any attribute beginning with "data-" is acceptable
		$isDataAttribute = $wgHtml5 && preg_match( '/^data-/i', $attribute );
		if ( !$isDataAttribute && !isset( $allowed[$attribute] ) ) {
			continue;
		}

		// Strip javascript "expression" from stylesheets.
		// http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if ( $attribute == 'style' ) {
			$value = Sanitizer::checkCss( $value );
		}

		if ( $attribute === 'id' ) {
			$value = Sanitizer::escapeId( $value, 'noninitial' );
		}

		// Paranoia: allow "simple" values but suppress javascript
		if ( in_array( $attribute, $uriValued, true )
			&& preg_match( self::EVIL_URI_PATTERN, $value )
		) {
			continue;
		}

		// NOTE: even though elements using href/src are not allowed directly,
		//       this validation is available to tag hook handlers, etc.
		// Any href or src not using an allowed protocol is dropped;
		// this also drops all relative URLs.
		$isUrlAttribute = ( $attribute === 'href' || $attribute === 'src' );
		if ( $isUrlAttribute && !preg_match( $urlPattern, $value ) ) {
			continue;
		}

		// A later occurrence overrides an earlier one, so the output
		// has only one attribute of each name.
		$kept[$attribute] = $value;
	}

	// itemtype, itemid, itemref don't make sense without itemscope
	if ( $wgAllowMicrodataAttributes && !array_key_exists( 'itemscope', $kept ) ) {
		unset( $kept['itemtype'], $kept['itemid'], $kept['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $kept;
}
00777 
/**
 * Merge two attribute arrays, combining their class lists.
 *
 * The result is array_merge( $a, $b ), so values from $b win. The one
 * exception is 'class': when both arrays hold a string class and the
 * two strings differ, the classes are split on whitespace, joined with
 * duplicates removed, and stored as a single space-separated string.
 *
 * @param $a Array: attribute name => value
 * @param $b Array: attribute name => value
 * @return Array: merged attributes
 */
static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	if ( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
		return $merged;
	}
	if ( !is_string( $a['class'] ) || !is_string( $b['class'] ) ) {
		return $merged;
	}
	if ( $a['class'] === $b['class'] ) {
		// Identical strings: array_merge() already produced the right value
		return $merged;
	}

	$tokens = preg_split(
		'/\s+/',
		"{$a['class']} {$b['class']}",
		-1,
		PREG_SPLIT_NO_EMPTY
	);
	$merged['class'] = implode( ' ', array_unique( $tokens ) );

	return $merged;
}
00799 
/**
 * Normalize a CSS value and reject it if it looks dangerous.
 *
 * Steps, in this order (the order matters, see inline comments):
 * 1. decode character references,
 * 2. decode CSS backslash escapes and line continuations,
 * 3. replace comments by a space,
 * 4. cut everything from a remaining comment-start token onwards,
 * 5. reject control characters and a set of problematic keywords.
 *
 * @param $value String: CSS text, e.g. the content of a style attribute
 * @return String: the normalized CSS, or one of the comment strings
 *   '/* invalid control char *' . '/' or '/* insecure input *' . '/'
 *   when the value is rejected
 */
static function checkCss( $value ) {
	// Decode character references like &#123;
	$value = Sanitizer::decodeCharReferences( $value );

	// Decode escape sequences and line continuation.
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to happen AFTER decoding character references, so this
	// function cannot return unsanitized escape sequences. Input may be
	// crafted so that character references decode to escape sequences
	// that decode to character references, but character references in
	// the return value are fine: the caller is supposed to escape those.
	static $escapePattern;
	if ( !$escapePattern ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$escapePattern = "/ $backslash
                                (?:
                                        ($nl) |  # 1. Line continuation
                                        ([0-9A-Fa-f]{1,6})$space? |  # 2. character number
                                        (.) | # 3. backslash cancelling special meaning
                                        () | # 4. backslash at end of string
                                )/xu";
	}
	$value = preg_replace_callback(
		$escapePattern,
		array( __CLASS__, 'cssDecodeCallback' ),
		$value
	);

	// Remove any comments; IE gets token splitting wrong.
	// This must come AFTER the two decoding steps, because those can
	// introduce comments. Comments are replaced with spaces rather than
	// removed, so this step cannot introduce character references or
	// escape sequences.
	$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

	// Drop anything after a comment-start token, to guard against
	// incorrect client implementations.
	$openComment = strpos( $value, '/*' );
	if ( $openComment !== false ) {
		$value = substr( $value, 0, $openComment );
	}

	// Reject control characters
	if ( preg_match( '/[\000-\010\016-\037\177]/', $value ) ) {
		return '/* invalid control char */';
	}

	// Reject problematic keywords
	if ( preg_match( '! expression | filter\s*: | accelerator\s*: | url\s*\( | image\s*\( | image-set\s*\( !ix', $value ) ) {
		return '/* insecure input */';
	}

	return $value;
}
00869 
/**
 * preg_replace_callback() handler used by checkCss() to decode one CSS
 * backslash escape.
 *
 * @param $matches Array: regex match; group 1 is a line continuation,
 *   group 2 a hexadecimal character number, group 3 a single escaped
 *   character. If groups 1-3 are all empty the escape is treated as a
 *   lone backslash.
 * @return String: '' for a line continuation; a cleaned-up hex escape
 *   ('\' + hex code + space) for newline, quotes and backslash;
 *   otherwise the decoded character itself
 */
static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation: swallow it entirely
		return '';
	}

	if ( $matches[2] !== '' ) {
		$decoded = codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$decoded = $matches[3];
	} else {
		$decoded = '\\';
	}

	// These characters need to stay escaped inside strings; emit a
	// normalized escape sequence to avoid parsing errors by clients.
	if ( in_array( $decoded, array( "\n", '"', "'", '\\' ) ) ) {
		return '\\' . dechex( ord( $decoded ) ) . ' ';
	}

	// The escape was unnecessary, return the plain character
	return $decoded;
}
00894 
/**
 * Turn raw tag attribute text into a sanitized attribute string.
 *
 * The text is parsed with decodeTagAttributes(), filtered with
 * validateTagAttributes() for the given element, and each surviving
 * pair is re-encoded as name="value" (name through htmlspecialchars(),
 * value through safeEncodeAttribute()).
 *
 * @param $text String: raw attribute text
 * @param $element String: element name
 * @return String: '' if nothing survives (or the input is blank),
 *   otherwise the attributes, each preceded by a single space
 */
static function fixTagAttributes( $text, $element ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	$validated = Sanitizer::validateTagAttributes(
		Sanitizer::decodeTagAttributes( $text ),
		$element
	);

	$result = '';
	foreach ( $validated as $attribute => $value ) {
		$name = htmlspecialchars( $attribute );
		$encoded = Sanitizer::safeEncodeAttribute( $value );
		$result .= ' ' . $name . '="' . $encoded . '"';
	}
	return $result;
}
00931 
/**
 * Encode a string for use as an HTML attribute value.
 *
 * Applies htmlspecialchars() with ENT_QUOTES, then turns newline,
 * carriage return and tab into numeric character references.
 *
 * @param $text String
 * @return String: encoded text
 */
static function encodeAttribute( $text ) {
	// Whitespace is normalized during attribute decoding, so any
	// non-space whitespace must be encoded ahead of time or it
	// won't be preserved.
	$whitespaceRefs = array(
		"\n" => '&#10;',
		"\r" => '&#13;',
		"\t" => '&#9;',
	);

	return strtr( htmlspecialchars( $text, ENT_QUOTES ), $whitespaceRefs );
}
00951 
/**
 * Encode an attribute value and additionally armor it against wikitext
 * constructs that a later parsing pass could expand.
 *
 * On top of encodeAttribute(), this replaces markup characters and magic
 * words (see the table below) with character references, and rewrites
 * the ':' in every URL protocol returned by wfUrlProtocols() via
 * armorLinksCallback().
 *
 * @param $text String
 * @return String: encoded and armored text
 */
static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$armor = array(
		'<'    => '&lt;',   // This should never happen,
		'>'    => '&gt;',   // we've received invalid input
		'"'    => '&quot;', // which should have been escaped.
		'{'    => '&#123;',
		'['    => '&#91;',
		"''"   => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC'  => '&#82;FC',
		'PMID' => '&#80;MID',
		'|'    => '&#124;',
		'__'   => '&#95;_',
	);
	$armored = strtr( Sanitizer::encodeAttribute( $text ), $armor );

	# Stupid hack: break up anything that looks like a URL protocol
	$protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';
	return preg_replace_callback(
		$protocolPattern,
		array( 'Sanitizer', 'armorLinksCallback' ),
		$armored
	);
}
00984 
/**
 * Escape a string so it can be used as an id attribute value.
 *
 * Two modes:
 * - When $wgHtml5 and $wgExperimentalHtmlIds are both set and 'legacy'
 *   is not among the options: character references are decoded, every
 *   run of whitespace, underscores, quotes, '&', '#' and '%' becomes a
 *   single '_', and leading/trailing underscores are trimmed. An empty
 *   result becomes '_'.
 * - Otherwise (HTML4 style): spaces become underscores, character
 *   references are decoded, the result is urlencode()d, then '%3A' is
 *   turned back into ':' and every remaining '%' into '.'. Unless the
 *   'noninitial' option is given, an 'x' is prepended when the id does
 *   not start with an ASCII letter.
 *
 * @param $id String: id to escape
 * @param $options Mixed: string or array of strings; recognized values
 *   are 'legacy' and 'noninitial'
 * @return String: escaped id
 */
static function escapeId( $id, $options = array() ) {
	global $wgHtml5, $wgExperimentalHtmlIds;
	$options = (array)$options;

	$html5Ids = $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options );
	if ( $html5Ids ) {
		$id = Sanitizer::decodeCharReferences( $id );
		$id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
		$id = trim( $id, '_' );

		# An empty result must have been all whitespace to start with.
		return ( $id === '' ) ? '_' : $id;
	}

	# HTML4-style escaping
	static $replace = array(
		'%3A' => ':',
		'%' => '.'
	);

	$underscored = strtr( $id, ' ', '_' );
	$encoded = urlencode( Sanitizer::decodeCharReferences( $underscored ) );
	$encoded = str_replace( array_keys( $replace ), array_values( $replace ), $encoded );

	$startsWithLetter = preg_match( '/^[a-zA-Z]/', $encoded );
	if ( !$startsWithLetter && !in_array( 'noninitial', $options ) ) {
		// Initial character must be a letter!
		$encoded = 'x' . $encoded;
	}

	return $encoded;
}
01048 
/**
 * Escape a string so it can be used as a CSS class name.
 *
 * A leading digit or hyphen, control characters and spaces, the listed
 * punctuation characters, and the byte pair 0xC2 0xA0 are each replaced
 * by '_'; runs of underscores are then collapsed to one and trailing
 * underscores are removed.
 *
 * @param $class String
 * @return String: escaped class name
 */
static function escapeClass( $class ) {
	$patterns = array(
		// Ugly stuff that gets converted to underscores
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		// Collapse repeated underscores
		'/_+/',
	);

	$underscored = preg_replace( $patterns, '_', $class );

	// Kill underscores at the end
	return rtrim( $underscored, '_' );
}
01067 
/**
 * Escape text for HTML output while letting existing character
 * references through.
 *
 * The references are first decoded with decodeCharReferences() and the
 * result is then escaped with htmlspecialchars().
 *
 * @param $html String
 * @return String: escaped text
 */
static function escapeHtmlAllowEntities( $html ) {
	$decoded = Sanitizer::decodeCharReferences( $html );

	# ENT_QUOTES: it seems wise to escape ' as well as ", as a matter
	# of course. Can't hurt.
	return htmlspecialchars( $decoded, ENT_QUOTES );
}
01082 
/**
 * preg_replace_callback() handler for safeEncodeAttribute(): replaces
 * every ':' in the first capture group with the reference '&#58;'.
 *
 * @param $matches Array: regex match, group 1 is rewritten
 * @return String
 */
private static function armorLinksCallback( $matches ) {
	$protocol = $matches[1];
	return strtr( $protocol, array( ':' => '&#58;' ) );
}
01091 
/**
 * Parse raw tag attribute text into an array.
 *
 * Attribute names are lowercased. In each value, runs of tab, CR, LF
 * and space are collapsed to a single space, the value is trimmed, and
 * character references are decoded. When a name occurs more than once
 * the last occurrence wins.
 *
 * @param $text String: raw attribute text
 * @return Array: attribute name => decoded value; empty if the text is
 *   blank or nothing matches getAttribsRegex()
 */
public static function decodeTagAttributes( $text ) {
	$decoded = array();

	if ( trim( $text ) == '' ) {
		return $decoded;
	}

	$matches = array();
	$found = preg_match_all(
		self::getAttribsRegex(),
		$text,
		$matches,
		PREG_SET_ORDER
	);
	if ( !$found ) {
		return $decoded;
	}

	foreach ( $matches as $set ) {
		$name = strtolower( $set[1] );
		$raw = Sanitizer::getTagAttributeCallback( $set );

		// Normalize whitespace
		$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

		// Decode character references
		$decoded[$name] = Sanitizer::decodeCharReferences( $collapsed );
	}

	return $decoded;
}
01128 
/**
 * Pick the attribute value out of one regex match set.
 *
 * The highest-numbered capture group that is set wins:
 * 6 = illegal #XXXXXX color without quotes, 5 = unquoted value,
 * 4 = single-quoted value, 3 = double-quoted value. When none of these
 * is set and group 2 is not set either, the attribute had no value and
 * its name (group 1) is returned.
 *
 * @param $set Array: one match set
 * @return String: the attribute value
 * @throws MWException when group 2 is set but none of groups 3-6 is
 */
private static function getTagAttributeCallback( $set ) {
	# Groups from most to least specific: 6 is the illegal unquoted
	# #XXXXXX color, 5 unquoted, 4 single-quoted, 3 double-quoted.
	for ( $group = 6; $group >= 3; $group-- ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value.
		# For 'reduced' form, return explicitly the attribute name here.
		return $set[1];
	}

	throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
01158 
/**
 * Normalize an attribute value: character references first
 * (normalizeCharReferences()), then whitespace (normalizeWhitespace()),
 * and finally every double quote is replaced by '&quot;'.
 *
 * @param $text String
 * @return String: normalized value
 */
private static function normalizeAttributeValue( $text ) {
	$normalized = Sanitizer::normalizeCharReferences( $text );
	$normalized = self::normalizeWhitespace( $normalized );

	return str_replace( '"', '&quot;', $normalized );
}
01175 
/**
 * Replace each whitespace character with a plain space.
 *
 * A CR LF pair counts as one unit and yields a single space; a lone
 * space, CR, LF or tab yields one space each. Runs are not collapsed.
 *
 * @param $text String
 * @return String
 */
private static function normalizeWhitespace( $text ) {
	$whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';

	return preg_replace( $whitespace, ' ', $text );
}
01186 
/**
 * Normalize whitespace in a section name: every run of spaces and
 * underscores becomes a single space, and the result is trimmed.
 *
 * @param $section String
 * @return String
 */
static function normalizeSectionNameWhitespace( $section ) {
	$collapsed = preg_replace( '/[ _]+/', ' ', $section );

	return trim( $collapsed );
}
01198 
/**
 * Normalize every character reference (and every bare ampersand) in a
 * string.
 *
 * Each match of CHAR_REFS_REGEX is rewritten by
 * normalizeCharReferencesCallback().
 *
 * @param $text String
 * @return String
 */
static function normalizeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'normalizeCharReferencesCallback' );

	return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
/**
 * preg_replace_callback() handler for normalizeCharReferences().
 *
 * Group 1 (named entity) goes through normalizeEntity(), group 2
 * (decimal) through decCharReference(), group 3 (hexadecimal) through
 * hexCharReference(). If none of them applies, or the chosen handler
 * returns null, the whole match is escaped with htmlspecialchars().
 *
 * @param $matches Array: match from CHAR_REFS_REGEX
 * @return String
 */
static function normalizeCharReferencesCallback( $matches ) {
	$normalized = null;

	if ( $matches[1] != '' ) {
		$normalized = Sanitizer::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$normalized = Sanitizer::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$normalized = Sanitizer::hexCharReference( $matches[3] );
	}

	// Nothing usable: escape the original text instead
	return ( $normalized === null )
		? htmlspecialchars( $matches[0] )
		: $normalized;
}
01239 
/**
 * Normalize a named entity.
 *
 * - A name found in $htmlEntityAliases yields '&' + alias target + ';'.
 * - lt, gt, amp and quot are returned as named entities.
 * - Any other name found in $htmlEntities yields the decimal numeric
 *   reference for its code point.
 * - An unknown name is returned with its ampersand escaped as '&amp;'.
 *
 * @param $name String: entity name without '&' and ';'
 * @return String
 */
static function normalizeEntity( $name ) {
	if ( isset( self::$htmlEntityAliases[$name] ) ) {
		return '&' . self::$htmlEntityAliases[$name] . ';';
	}

	$keptAsNamed = array( 'lt', 'gt', 'amp', 'quot' );
	if ( in_array( $name, $keptAsNamed ) ) {
		return "&$name;";
	}

	if ( isset( self::$htmlEntities[$name] ) ) {
		return '&#' . self::$htmlEntities[$name] . ';';
	}

	// Not a known entity: neutralize the ampersand
	return "&amp;$name;";
}
01262 
/**
 * Normalize a decimal character reference.
 *
 * @param $codepoint String|Integer: the digits of the reference
 * @return String|null: '&#N;' with N printed as a decimal integer, or
 *   null when validateCodepoint() rejects the code point
 */
static function decCharReference( $codepoint ) {
	$point = intval( $codepoint );

	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
01275 
/**
 * Normalize a hexadecimal character reference.
 *
 * @param $codepoint String: the hex digits of the reference
 * @return String|null: '&#xN;' with N printed in lowercase hexadecimal,
 *   or null when validateCodepoint() rejects the code point
 */
static function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );

	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
01288 
/**
 * Tell whether a code point is acceptable in a character reference.
 *
 * Accepted are 0x09, 0x0A, 0x0D and the ranges 0x20-0xD7FF,
 * 0xE000-0xFFFD and 0x10000-0x10FFFF (bounds inclusive).
 *
 * @param $codepoint Integer
 * @return Boolean
 */
private static function validateCodepoint( $codepoint ) {
	// Tab, line feed, carriage return
	if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}
	if ( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
		return true;
	}
	if ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}

	return ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
}
01302 
/**
 * Decode the character references in a string.
 *
 * Each match of CHAR_REFS_REGEX is rewritten by
 * decodeCharReferencesCallback().
 *
 * @param $text String
 * @return String
 */
public static function decodeCharReferences( $text ) {
	$handler = array( 'Sanitizer', 'decodeCharReferencesCallback' );

	return preg_replace_callback( self::CHAR_REFS_REGEX, $handler, $text );
}
01316 
/**
 * Decode the character references in a string and, if at least one
 * replacement was made, pass the result through $wgContLang->normalize().
 *
 * When CHAR_REFS_REGEX matches nothing the text is returned as is,
 * without calling normalize().
 *
 * @param $text String
 * @return String
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	global $wgContLang;

	$replacements = 0;
	$decoded = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		array( 'Sanitizer', 'decodeCharReferencesCallback' ),
		$text,
		-1, // no limit
		$replacements
	);

	if ( !$replacements ) {
		return $decoded;
	}

	return $wgContLang->normalize( $decoded );
}
01340 
/**
 * preg_replace_callback() handler for decodeCharReferences().
 *
 * Group 1 (named entity) goes through decodeEntity(); group 2 (decimal)
 * and group 3 (hexadecimal) are converted to an integer and go through
 * decodeChar(). Otherwise the match is returned unchanged.
 *
 * @param $matches Array: match from CHAR_REFS_REGEX
 * @return String
 */
static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return Sanitizer::decodeEntity( $matches[1] );
	}

	if ( $matches[2] != '' ) {
		$decimal = intval( $matches[2] );
		return Sanitizer::decodeChar( $decimal );
	}

	if ( $matches[3] != '' ) {
		$hex = hexdec( $matches[3] );
		return Sanitizer::decodeChar( $hex );
	}

	# Last case should be an ampersand by itself
	return $matches[0];
}
01356 
/**
 * Convert a code point to its character.
 *
 * @param $codepoint Integer
 * @return String: result of codepointToUtf8() when validateCodepoint()
 *   accepts the code point, UTF8_REPLACEMENT otherwise
 */
static function decodeChar( $codepoint ) {
	return Sanitizer::validateCodepoint( $codepoint )
		? codepointToUtf8( $codepoint )
		: UTF8_REPLACEMENT;
}
01371 
/**
 * Decode a named entity.
 *
 * The name is first mapped through $htmlEntityAliases. A name found in
 * $htmlEntities is converted with codepointToUtf8(); any other name is
 * returned as '&' + name + ';', using the alias target if the name was
 * an alias.
 *
 * @param $name String: entity name without '&' and ';'
 * @return String
 */
static function decodeEntity( $name ) {
	$canonical = isset( self::$htmlEntityAliases[$name] )
		? self::$htmlEntityAliases[$name]
		: $name;

	if ( !isset( self::$htmlEntities[$canonical] ) ) {
		// Unknown entity: hand it back in reference form
		return "&$canonical;";
	}

	return codepointToUtf8( self::$htmlEntities[$canonical] );
}
01390 
/**
 * Fetch the attribute whitelist for one element.
 *
 * @param $element String: element name
 * @return Array: the element's entry in setupAttributeWhitelist(), or
 *   an empty array when the element has no entry
 */
static function attributeWhitelist( $element ) {
	$all = Sanitizer::setupAttributeWhitelist();

	if ( isset( $all[$element] ) ) {
		return $all[$element];
	}

	return array();
}
01403 
01409         static function setupAttributeWhitelist() {
01410                 global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
01411 
01412                 static $whitelist, $staticInitialised;
01413                 $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgHtml5', 'wgAllowMicrodataAttributes' ) );
01414 
01415                 if ( isset( $whitelist ) && $staticInitialised == $globalContext ) {
01416                         return $whitelist;
01417                 }
01418 
01419                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
01420 
01421                 if ( $wgAllowRdfaAttributes ) {
01422                         #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
01423                         $common = array_merge( $common, array(
01424                             'about', 'property', 'resource', 'datatype', 'typeof',
01425                         ) );
01426                 }
01427 
01428                 if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
01429                         # add HTML5 microdata tages as pecified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
01430                         $common = array_merge( $common, array(
01431                             'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
01432                         ) );
01433                 }
01434 
01435                 $block = array_merge( $common, array( 'align' ) );
01436                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
01437                 $tablecell = array( 'abbr',
01438                                     'axis',
01439                                     'headers',
01440                                     'scope',
01441                                     'rowspan',
01442                                     'colspan',
01443                                     'nowrap', # deprecated
01444                                     'width',  # deprecated
01445                                     'height', # deprecated
01446                                     'bgcolor' # deprecated
01447                                     );
01448 
01449                 # Numbers refer to sections in HTML 4.01 standard describing the element.
01450                 # See: http://www.w3.org/TR/html4/
01451                 $whitelist = array(
01452                         # 7.5.4
01453                         'div'        => $block,
01454                         'center'     => $common, # deprecated
01455                         'span'       => $block, # ??
01456 
01457                         # 7.5.5
01458                         'h1'         => $block,
01459                         'h2'         => $block,
01460                         'h3'         => $block,
01461                         'h4'         => $block,
01462                         'h5'         => $block,
01463                         'h6'         => $block,
01464 
01465                         # 7.5.6
01466                         # address
01467 
01468                         # 8.2.4
01469                         # bdo
01470 
01471                         # 9.2.1
01472                         'em'         => $common,
01473                         'strong'     => $common,
01474                         'cite'       => $common,
01475                         'dfn'        => $common,
01476                         'code'       => $common,
01477                         'samp'       => $common,
01478                         'kbd'        => $common,
01479                         'var'        => $common,
01480                         'abbr'       => $common,
01481                         # acronym
01482 
01483                         # 9.2.2
01484                         'blockquote' => array_merge( $common, array( 'cite' ) ),
01485                         # q
01486 
01487                         # 9.2.3
01488                         'sub'        => $common,
01489                         'sup'        => $common,
01490 
01491                         # 9.3.1
01492                         'p'          => $block,
01493 
01494                         # 9.3.2
01495                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
01496 
01497                         # 9.3.4
01498                         'pre'        => array_merge( $common, array( 'width' ) ),
01499 
01500                         # 9.4
01501                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
01502                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
01503 
01504                         # 10.2
01505                         'ul'         => array_merge( $common, array( 'type' ) ),
01506                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
01507                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
01508 
01509                         # 10.3
01510                         'dl'         => $common,
01511                         'dd'         => $common,
01512                         'dt'         => $common,
01513 
01514                         # 11.2.1
01515                         'table'      => array_merge( $common,
01516                                                                 array( 'summary', 'width', 'border', 'frame',
01517                                                                                 'rules', 'cellspacing', 'cellpadding',
01518                                                                                 'align', 'bgcolor',
01519                                                                 ) ),
01520 
01521                         # 11.2.2
01522                         'caption'    => array_merge( $common, array( 'align' ) ),
01523 
01524                         # 11.2.3
01525                         'thead'      => array_merge( $common, $tablealign ),
01526                         'tfoot'      => array_merge( $common, $tablealign ),
01527                         'tbody'      => array_merge( $common, $tablealign ),
01528 
01529                         # 11.2.4
01530                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
01531                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
01532 
01533                         # 11.2.5
01534                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
01535 
01536                         # 11.2.6
01537                         'td'         => array_merge( $common, $tablecell, $tablealign ),
01538                         'th'         => array_merge( $common, $tablecell, $tablealign ),
01539 
01540                         # 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object
01541                         'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa
01542 
01543                         # 13.2
01544                         # Not usually allowed, but may be used for extension-style hooks
01545                         # such as <math> when it is rasterized, or if $wgAllowImageTag is
01546                         # true
01547                         'img'        => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),
01548 
01549                         # 15.2.1
01550                         'tt'         => $common,
01551                         'b'          => $common,
01552                         'i'          => $common,
01553                         'big'        => $common,
01554                         'small'      => $common,
01555                         'strike'     => $common,
01556                         's'          => $common,
01557                         'u'          => $common,
01558 
01559                         # 15.2.2
01560                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
01561                         # basefont
01562 
01563                         # 15.3
01564                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
01565 
01566                         # XHTML Ruby annotation text module, simple ruby only.
01567                         # http://www.w3c.org/TR/ruby/
01568                         'ruby'       => $common,
01569                         # rbc
01570                         # rtc
01571                         'rb'         => $common,
01572                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
01573                         'rp'         => $common,
01574 
01575                         # MathML root element, where used for extensions
01576                         # 'title' may not be 100% valid here; it's XHTML
01577                         # http://www.w3.org/TR/REC-MathML/
01578                         'math'       => array( 'class', 'style', 'id', 'title' ),
01579 
01580                         # HTML 5 section 4.6
01581                         'bdi' => $common,
01582 
01583                 );
01584 
01585                 if ( $wgHtml5 ) {
01586                         # HTML5 elements, defined by:
01587                         # http://www.whatwg.org/specs/web-apps/current-work/multipage/
01588                         $whitelist += array(
01589                                 'data' => array_merge( $common, array( 'value' ) ),
01590                                 'time' => array_merge( $common, array( 'datetime' ) ),
01591                                 'mark' => $common,
01592 
01593                                 // meta and link are only permitted by removeHTMLtags when Microdata
01594                                 // is enabled so we don't bother adding a conditional to hide these
01595                                 // Also meta and link are only valid in WikiText as Microdata elements
01596                                 // (ie: validateTag rejects tags missing the attributes needed for Microdata)
01597                                 // So we don't bother including $common attributes that have no purpose.
01598                                 'meta' => array( 'itemprop', 'content' ),
01599                                 'link' => array( 'itemprop', 'href' ),
01600                         );
01601                 }
01602 
01603                 $staticInitialised = $globalContext;
01604 
01605                 return $whitelist;
01606         }
01607 
/**
 * Reduce a string to plain text: remove '<'...'>' delimited spans,
 * decode character references, then normalize whitespace.
 *
 * NOTE(review): references are decoded *after* the tag spans are
 * removed, so the result presumably may contain literal '<' or '&'
 * again and should be escaped before being placed in HTML output --
 * confirm against decodeCharReferences().
 *
 * @param $text String: input that may contain tags and entities
 * @return String: the text after the three passes above
 */
static function stripAllTags( $text ) {
        # Actual <tags>: every '<'...'>' span is replaced with ''
        $withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

        # Decode &entities first, then normalize whitespace on the decoded text
        return self::normalizeWhitespace(
                self::decodeCharReferences( $withoutTags )
        );
}
01628 
/**
 * Build a DOCTYPE declaration whose internal subset declares every
 * name in self::$htmlEntities as a numeric character reference.
 *
 * The declarations are emitted back to back, with no separator
 * between them; only the opening line and the closing "]>" are
 * followed by a newline.
 *
 * @return String: the complete "<!DOCTYPE html [ ... ]>" text
 */
static function hackDocType() {
        $declarations = array();
        foreach ( self::$htmlEntities as $name => $number ) {
                $declarations[] = '<!ENTITY ' . $name . ' "&#' . $number . ';">';
        }

        return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
01646 
/**
 * Normalize a URL: decode character references, percent-encode
 * control and markup-significant characters, and strip characters
 * from the host part that are ignored in IDNs.
 *
 * @param $url String: URL to clean
 * @return String: the cleaned URL; if the input does not have the
 *   "scheme:" shape, the decoded and escaped text is returned with
 *   no host processing
 */
static function cleanUrl( $url ) {
        # Normalize any HTML entities in input. They will be
        # re-escaped by makeExternalLink().
        $decoded = self::decodeCharReferences( $url );

        # Percent-encode ] [ < > " | and the bytes 0x00-0x20 and 0x7F,
        # some of which may only have appeared through the decoding above.
        $escaped = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
                array( __CLASS__, 'cleanUrlCallback' ), $decoded );

        # Split into "scheme:", optional "//host" and the remainder.
        $parts = array();
        if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $escaped, $parts ) ) {
                # Nothing that looks like a scheme: no host to validate
                return $escaped;
        }

        // Characters that will be ignored in IDNs.
        // http://tools.ietf.org/html/rfc3454#section-3.1
        // Strip them before further processing so blacklists and such work.
        $ignorable = "/
                                \\s|          # general whitespace
                                \xc2\xad|     # 00ad SOFT HYPHEN
                                \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
                                \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
                                \xe2\x81\xa0| # 2060 WORD JOINER
                                \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
                                \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
                                \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
                                \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
                                \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
                                \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
                                \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
                                [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
                                /xuD";

        # $parts[2] is the "//host" piece; it is an empty string when the
        # URL has no authority section.
        $cleanHost = preg_replace( $ignorable, '', $parts[2] );

        // @todo FIXME: Validate hostnames here

        return $parts[1] . $cleanHost . $parts[3];
}
01693 
/**
 * preg_replace_callback() handler used by cleanUrl(): percent-encode
 * the text matched by the whole pattern.
 *
 * @param $matches Array: match array; only index 0 is read
 * @return String: urlencode()d form of the matched text
 */
static function cleanUrlCallback( $matches ) {
        $matchedText = $matches[0];

        return urlencode( $matchedText );
}
01701 
/**
 * Check whether a string has the shape of an e-mail address.
 *
 * The 'isValidEmailAddr' hook is run first with $addr and a
 * by-reference $result (initially null). If wfRunHooks() returns a
 * false value, whatever the hook left in $result is returned as-is.
 * Otherwise the address is matched against a local-part@domain
 * pattern: one or more "atext" characters or dots, an at sign, then
 * one or more dot-separated labels of letters, digits and hyphens.
 *
 * The pattern is anchored with the D (dollar-end-only) modifier so
 * that "$" matches only at the very end of the subject; without it
 * an address followed by a trailing newline would be accepted.
 *
 * @param $addr String: e-mail address to check
 * @return Bool (or the hook-supplied $result when the hook aborts)
 */
public static function validateEmail( $addr ) {
        $result = null;
        if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
                return $result;
        }

        // Please note the strings below are placed inside brackets [], which
        // makes the hyphen "-" a range indicator. Hence it is double
        // backslashed below.
        // See bug 26948
        $rfc5322_atext   = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
        $rfc1034_ldh_str = "a-z0-9\\-";

        // Note: the in-pattern comment on the "@" line says 'apostrophe';
        // that line matches a literal at sign.
        $HTML5_email_regexp = "/
                ^                      # start of string
                [$rfc5322_atext\\.]+    # user part which is liberal :p
                @                      # 'apostrophe'
                [$rfc1034_ldh_str]+       # First domain part
                (\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
                $                      # End of string
                /ixD"; // case Insensitive, eXtended, Dollar matches at end only

        return (bool) preg_match( $HTML5_email_regexp, $addr );
}
01753 }