MediaWiki
master
|
<?php

/**
 * Static helpers for cleaning user-supplied HTML-like markup: tag filtering,
 * attribute validation, and character-reference normalisation/decoding.
 */
class Sanitizer {
	/**
	 * Matches one character reference or a bare ampersand.
	 *
	 * Capture groups, in alternation order:
	 *   1. name of a named entity          (&name;)
	 *   2. digits of a decimal reference   (&#123;)
	 *   3. digits of a hexadecimal one     (&#x7b; or &#X7B;)
	 *   4. a lone ampersand that starts none of the above
	 *
	 * The pattern is compiled with the /x flag: whitespace inside it is
	 * ignored, which is why it can be wrapped over several lines and why
	 * each literal '#' has to be backslash-escaped.
	 */
	const CHAR_REFS_REGEX =
		'/&([A-Za-z0-9\x80-\xff]+);
		 |&\#([0-9]+);
		 |&\#[xX]([0-9A-Fa-f]+);
		 |(&)/x';

	/**
	 * Case-insensitive detector for script-scheme keywords in a value.
	 *
	 * Hits on "javascript" or "vbscript" when the keyword is preceded by the
	 * start of the string, a whitespace character, or the closing delimiter
	 * of a C-style comment (optionally followed by whitespace), and is
	 * followed by a non-word character or the end of the string.
	 */
	const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';

	/**
	 * Matches an attribute name of the form "xmlns:" followed by one or more
	 * characters drawn from: colon, ASCII letters, underscore, hyphen, dot,
	 * digits.
	 */
	const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";

	/**
	 * Named character entities mapped to their decimal code points.
	 *
	 * Keys are case-sensitive (e.g. 'Aacute' and 'aacute' are different
	 * entries). Entries are listed in their original order, several per
	 * row, with a marker comment at the first entry of each initial letter.
	 */
	static $htmlEntities = array(
		# A
		'Aacute' => 193, 'aacute' => 225, 'Acirc' => 194, 'acirc' => 226,
		'acute' => 180, 'AElig' => 198, 'aelig' => 230, 'Agrave' => 192,
		'agrave' => 224, 'alefsym' => 8501, 'Alpha' => 913, 'alpha' => 945,
		'amp' => 38, 'and' => 8743, 'ang' => 8736,
		'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
		'Aring' => 197, 'aring' => 229, 'asymp' => 8776, 'Atilde' => 195,
		'atilde' => 227, 'Auml' => 196, 'auml' => 228,
		# B
		'bdquo' => 8222, 'Beta' => 914, 'beta' => 946, 'brvbar' => 166,
		'bull' => 8226,
		# C
		'cap' => 8745, 'Ccedil' => 199, 'ccedil' => 231, 'cedil' => 184,
		'cent' => 162, 'Chi' => 935, 'chi' => 967, 'circ' => 710,
		'clubs' => 9827, 'cong' => 8773, 'copy' => 169, 'crarr' => 8629,
		'cup' => 8746, 'curren' => 164,
		# D
		'dagger' => 8224, 'Dagger' => 8225, 'darr' => 8595, 'dArr' => 8659,
		'deg' => 176, 'Delta' => 916, 'delta' => 948, 'diams' => 9830,
		'divide' => 247,
		# E
		'Eacute' => 201, 'eacute' => 233, 'Ecirc' => 202, 'ecirc' => 234,
		'Egrave' => 200, 'egrave' => 232, 'empty' => 8709, 'emsp' => 8195,
		'ensp' => 8194, 'Epsilon' => 917, 'epsilon' => 949, 'equiv' => 8801,
		'Eta' => 919, 'eta' => 951, 'ETH' => 208, 'eth' => 240,
		'Euml' => 203, 'euml' => 235, 'euro' => 8364, 'exist' => 8707,
		# F
		'fnof' => 402, 'forall' => 8704, 'frac12' => 189, 'frac14' => 188,
		'frac34' => 190, 'frasl' => 8260,
		# G
		'Gamma' => 915, 'gamma' => 947, 'ge' => 8805, 'gt' => 62,
		# H
		'harr' => 8596, 'hArr' => 8660, 'hearts' => 9829, 'hellip' => 8230,
		# I
		'Iacute' => 205, 'iacute' => 237, 'Icirc' => 206, 'icirc' => 238,
		'iexcl' => 161, 'Igrave' => 204, 'igrave' => 236, 'image' => 8465,
		'infin' => 8734, 'int' => 8747, 'Iota' => 921, 'iota' => 953,
		'iquest' => 191, 'isin' => 8712, 'Iuml' => 207, 'iuml' => 239,
		# K
		'Kappa' => 922, 'kappa' => 954,
		# L
		'Lambda' => 923, 'lambda' => 955, 'lang' => 9001, 'laquo' => 171,
		'larr' => 8592, 'lArr' => 8656, 'lceil' => 8968, 'ldquo' => 8220,
		'le' => 8804, 'lfloor' => 8970, 'lowast' => 8727, 'loz' => 9674,
		'lrm' => 8206, 'lsaquo' => 8249, 'lsquo' => 8216, 'lt' => 60,
		# M
		'macr' => 175, 'mdash' => 8212, 'micro' => 181, 'middot' => 183,
		'minus' => 8722, 'Mu' => 924, 'mu' => 956,
		# N
		'nabla' => 8711, 'nbsp' => 160, 'ndash' => 8211, 'ne' => 8800,
		'ni' => 8715, 'not' => 172, 'notin' => 8713, 'nsub' => 8836,
		'Ntilde' => 209, 'ntilde' => 241, 'Nu' => 925, 'nu' => 957,
		# O
		'Oacute' => 211, 'oacute' => 243, 'Ocirc' => 212, 'ocirc' => 244,
		'OElig' => 338, 'oelig' => 339, 'Ograve' => 210, 'ograve' => 242,
		'oline' => 8254, 'Omega' => 937, 'omega' => 969, 'Omicron' => 927,
		'omicron' => 959, 'oplus' => 8853, 'or' => 8744, 'ordf' => 170,
		'ordm' => 186, 'Oslash' => 216, 'oslash' => 248, 'Otilde' => 213,
		'otilde' => 245, 'otimes' => 8855, 'Ouml' => 214, 'ouml' => 246,
		# P
		'para' => 182, 'part' => 8706, 'permil' => 8240, 'perp' => 8869,
		'Phi' => 934, 'phi' => 966, 'Pi' => 928, 'pi' => 960,
		'piv' => 982, 'plusmn' => 177, 'pound' => 163, 'prime' => 8242,
		'Prime' => 8243, 'prod' => 8719, 'prop' => 8733, 'Psi' => 936,
		'psi' => 968,
		# Q
		'quot' => 34,
		# R
		'radic' => 8730, 'rang' => 9002, 'raquo' => 187, 'rarr' => 8594,
		'rArr' => 8658, 'rceil' => 8969, 'rdquo' => 8221, 'real' => 8476,
		'reg' => 174, 'rfloor' => 8971, 'Rho' => 929, 'rho' => 961,
		'rlm' => 8207, 'rsaquo' => 8250, 'rsquo' => 8217,
		# S
		'sbquo' => 8218, 'Scaron' => 352, 'scaron' => 353, 'sdot' => 8901,
		'sect' => 167, 'shy' => 173, 'Sigma' => 931, 'sigma' => 963,
		'sigmaf' => 962, 'sim' => 8764, 'spades' => 9824, 'sub' => 8834,
		'sube' => 8838, 'sum' => 8721, 'sup' => 8835, 'sup1' => 185,
		'sup2' => 178, 'sup3' => 179, 'supe' => 8839, 'szlig' => 223,
		# T
		'Tau' => 932, 'tau' => 964, 'there4' => 8756, 'Theta' => 920,
		'theta' => 952, 'thetasym' => 977, 'thinsp' => 8201, 'THORN' => 222,
		'thorn' => 254, 'tilde' => 732, 'times' => 215, 'trade' => 8482,
		# U
		'Uacute' => 218, 'uacute' => 250, 'uarr' => 8593, 'uArr' => 8657,
		'Ucirc' => 219, 'ucirc' => 251, 'Ugrave' => 217, 'ugrave' => 249,
		'uml' => 168, 'upsih' => 978, 'Upsilon' => 933, 'upsilon' => 965,
		'Uuml' => 220, 'uuml' => 252,
		# W
		'weierp' => 8472,
		# X
		'Xi' => 926, 'xi' => 958,
		# Y
		'Yacute' => 221, 'yacute' => 253, 'yen' => 165, 'Yuml' => 376,
		'yuml' => 255,
		# Z
		'Zeta' => 918, 'zeta' => 950, 'zwj' => 8205, 'zwnj' => 8204
	);
00314 00318 static $htmlEntityAliases = array( 00319 'רלמ' => 'rlm', 00320 'رلم' => 'rlm', 00321 ); 00322 00326 static $attribsRegex; 00327 00333 static function getAttribsRegex() { 00334 if ( self::$attribsRegex === null ) { 00335 $attribFirst = '[:A-Z_a-z0-9]'; 00336 $attrib = '[:A-Z_a-z-.0-9]'; 00337 $space = '[\x09\x0a\x0d\x20]'; 00338 self::$attribsRegex = 00339 "/(?:^|$space)({$attribFirst}{$attrib}*) 00340 ($space*=$space* 00341 (?: 00342 # The attribute value: quoted or alone 00343 \"([^<\"]*)\" 00344 | '([^<']*)' 00345 | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+) 00346 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of 00347 # colors are specified like this. 00348 # We'll be normalizing it. 00349 ) 00350 )?(?=$space|\$)/sx"; 00351 } 00352 return self::$attribsRegex; 00353 } 00354 00366 static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) { 00367 global $wgUseTidy, $wgHtml5, $wgAllowMicrodataAttributes, $wgAllowImageTag; 00368 00369 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags, 00370 $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised; 00371 00372 wfProfileIn( __METHOD__ ); 00373 00374 // Base our staticInitialised variable off of the global config state so that if the globals 00375 // are changed (like in the secrewed up test system) we will re-initialise the settings. 
00376 $globalContext = implode( '-', compact( 'wgHtml5', 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) ); 00377 if ( !$staticInitialised || $staticInitialised != $globalContext ) { 00378 00379 $htmlpairsStatic = array( # Tags that must be closed 00380 'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', 00381 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', 00382 'strike', 'strong', 'tt', 'var', 'div', 'center', 00383 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', 00384 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn', 00385 'kbd', 'samp' 00386 ); 00387 if ( $wgHtml5 ) { 00388 $htmlpairsStatic = array_merge( $htmlpairsStatic, array( 'data', 'time', 'mark' ) ); 00389 } 00390 $htmlsingle = array( 00391 'br', 'hr', 'li', 'dt', 'dd' 00392 ); 00393 $htmlsingleonly = array( # Elements that cannot have close tags 00394 'br', 'hr' 00395 ); 00396 if ( $wgHtml5 && $wgAllowMicrodataAttributes ) { 00397 $htmlsingle[] = $htmlsingleonly[] = 'meta'; 00398 $htmlsingle[] = $htmlsingleonly[] = 'link'; 00399 } 00400 $htmlnest = array( # Tags that can be nested--?? 
00401 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', 00402 'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span' 00403 ); 00404 $tabletags = array( # Can only appear inside table, we will close them 00405 'td', 'th', 'tr', 00406 ); 00407 $htmllist = array( # Tags used by list 00408 'ul','ol', 00409 ); 00410 $listtags = array( # Tags that can appear in a list 00411 'li', 00412 ); 00413 00414 if ( $wgAllowImageTag ) { 00415 $htmlsingle[] = 'img'; 00416 $htmlsingleonly[] = 'img'; 00417 } 00418 00419 $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) ); 00420 $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) ); 00421 00422 # Convert them all to hashtables for faster lookup 00423 $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags', 00424 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' ); 00425 foreach ( $vars as $var ) { 00426 $$var = array_flip( $$var ); 00427 } 00428 $staticInitialised = $globalContext; 00429 } 00430 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays 00431 $extratags = array_flip( $extratags ); 00432 $removetags = array_flip( $removetags ); 00433 $htmlpairs = array_merge( $extratags, $htmlpairsStatic ); 00434 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags ); 00435 00436 # Remove HTML comments 00437 $text = Sanitizer::removeHTMLcomments( $text ); 00438 $bits = explode( '<', $text ); 00439 $text = str_replace( '>', '>', array_shift( $bits ) ); 00440 if ( !$wgUseTidy ) { 00441 $tagstack = $tablestack = array(); 00442 foreach ( $bits as $x ) { 00443 $regs = array(); 00444 # $slash: Does the current element start with a '/'? 
00445 # $t: Current element name 00446 # $params: String between element name and > 00447 # $brace: Ending '>' or '/>' 00448 # $rest: Everything until the next element of $bits 00449 if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) { 00450 list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs; 00451 } else { 00452 $slash = $t = $params = $brace = $rest = null; 00453 } 00454 00455 $badtag = false; 00456 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) { 00457 # Check our stack 00458 if ( $slash && isset( $htmlsingleonly[$t] ) ) { 00459 $badtag = true; 00460 } elseif ( $slash ) { 00461 # Closing a tag... is it the one we just opened? 00462 $ot = @array_pop( $tagstack ); 00463 if ( $ot != $t ) { 00464 if ( isset( $htmlsingleallowed[$ot] ) ) { 00465 # Pop all elements with an optional close tag 00466 # and see if we find a match below them 00467 $optstack = array(); 00468 array_push( $optstack, $ot ); 00469 wfSuppressWarnings(); 00470 $ot = array_pop( $tagstack ); 00471 wfRestoreWarnings(); 00472 while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) { 00473 array_push( $optstack, $ot ); 00474 wfSuppressWarnings(); 00475 $ot = array_pop( $tagstack ); 00476 wfRestoreWarnings(); 00477 } 00478 if ( $t != $ot ) { 00479 # No match. 
Push the optional elements back again 00480 $badtag = true; 00481 wfSuppressWarnings(); 00482 $ot = array_pop( $optstack ); 00483 wfRestoreWarnings(); 00484 while ( $ot ) { 00485 array_push( $tagstack, $ot ); 00486 wfSuppressWarnings(); 00487 $ot = array_pop( $optstack ); 00488 wfRestoreWarnings(); 00489 } 00490 } 00491 } else { 00492 @array_push( $tagstack, $ot ); 00493 # <li> can be nested in <ul> or <ol>, skip those cases: 00494 if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) { 00495 $badtag = true; 00496 } 00497 } 00498 } else { 00499 if ( $t == 'table' ) { 00500 $tagstack = array_pop( $tablestack ); 00501 } 00502 } 00503 $newparams = ''; 00504 } else { 00505 # Keep track for later 00506 if ( isset( $tabletags[$t] ) && 00507 !in_array( 'table', $tagstack ) ) { 00508 $badtag = true; 00509 } elseif ( in_array( $t, $tagstack ) && 00510 !isset( $htmlnest [$t ] ) ) { 00511 $badtag = true; 00512 # Is it a self closed htmlpair ? (bug 5487) 00513 } elseif ( $brace == '/>' && 00514 isset( $htmlpairs[$t] ) ) { 00515 $badtag = true; 00516 } elseif ( isset( $htmlsingleonly[$t] ) ) { 00517 # Hack to force empty tag for uncloseable elements 00518 $brace = '/>'; 00519 } elseif ( isset( $htmlsingle[$t] ) ) { 00520 # Hack to not close $htmlsingle tags 00521 $brace = null; 00522 # Still need to push this optionally-closed tag to 00523 # the tag stack so that we can match end tags 00524 # instead of marking them as bad. 00525 array_push( $tagstack, $t ); 00526 } elseif ( isset( $tabletags[$t] ) 00527 && in_array( $t, $tagstack ) ) { 00528 // New table tag but forgot to close the previous one 00529 $text .= "</$t>"; 00530 } else { 00531 if ( $t == 'table' ) { 00532 array_push( $tablestack, $tagstack ); 00533 $tagstack = array(); 00534 } 00535 array_push( $tagstack, $t ); 00536 } 00537 00538 # Replace any variables or template parameters with 00539 # plaintext results. 
00540 if( is_callable( $processCallback ) ) { 00541 call_user_func_array( $processCallback, array( &$params, $args ) ); 00542 } 00543 00544 if ( !Sanitizer::validateTag( $params, $t ) ) { 00545 $badtag = true; 00546 } 00547 00548 # Strip non-approved attributes from the tag 00549 $newparams = Sanitizer::fixTagAttributes( $params, $t ); 00550 } 00551 if ( !$badtag ) { 00552 $rest = str_replace( '>', '>', $rest ); 00553 $close = ( $brace == '/>' && !$slash ) ? ' /' : ''; 00554 $text .= "<$slash$t$newparams$close>$rest"; 00555 continue; 00556 } 00557 } 00558 $text .= '<' . str_replace( '>', '>', $x); 00559 } 00560 # Close off any remaining tags 00561 while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) { 00562 $text .= "</$t>\n"; 00563 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); } 00564 } 00565 } else { 00566 # this might be possible using tidy itself 00567 foreach ( $bits as $x ) { 00568 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', 00569 $x, $regs ); 00570 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs; 00571 $badtag = false; 00572 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) { 00573 if( is_callable( $processCallback ) ) { 00574 call_user_func_array( $processCallback, array( &$params, $args ) ); 00575 } 00576 00577 if ( !Sanitizer::validateTag( $params, $t ) ) { 00578 $badtag = true; 00579 } 00580 00581 $newparams = Sanitizer::fixTagAttributes( $params, $t ); 00582 if ( !$badtag ) { 00583 $rest = str_replace( '>', '>', $rest ); 00584 $text .= "<$slash$t$newparams$brace$rest"; 00585 continue; 00586 } 00587 } 00588 $text .= '<' . 
str_replace( '>', '>', $x); 00589 } 00590 } 00591 wfProfileOut( __METHOD__ ); 00592 return $text; 00593 } 00594 00605 static function removeHTMLcomments( $text ) { 00606 wfProfileIn( __METHOD__ ); 00607 while (($start = strpos($text, '<!--')) !== false) { 00608 $end = strpos($text, '-->', $start + 4); 00609 if ($end === false) { 00610 # Unterminated comment; bail out 00611 break; 00612 } 00613 00614 $end += 3; 00615 00616 # Trim space and newline if the comment is both 00617 # preceded and followed by a newline 00618 $spaceStart = max($start - 1, 0); 00619 $spaceLen = $end - $spaceStart; 00620 while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) { 00621 $spaceStart--; 00622 $spaceLen++; 00623 } 00624 while (substr($text, $spaceStart + $spaceLen, 1) === ' ') 00625 $spaceLen++; 00626 if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") { 00627 # Remove the comment, leading and trailing 00628 # spaces, and leave only one newline. 00629 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1); 00630 } 00631 else { 00632 # Remove just the comment. 
00633 $text = substr_replace($text, '', $start, $end - $start); 00634 } 00635 } 00636 wfProfileOut( __METHOD__ ); 00637 return $text; 00638 } 00639 00651 static function validateTag( $params, $element ) { 00652 $params = Sanitizer::decodeTagAttributes( $params ); 00653 00654 if ( $element == 'meta' || $element == 'link' ) { 00655 if ( !isset( $params['itemprop'] ) ) { 00656 // <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content 00657 return false; 00658 } 00659 if ( $element == 'meta' && !isset( $params['content'] ) ) { 00660 // <meta> must have a content="" for the itemprop 00661 return false; 00662 } 00663 if ( $element == 'link' && !isset( $params['href'] ) ) { 00664 // <link> must have an associated href="" 00665 return false; 00666 } 00667 } 00668 00669 return true; 00670 } 00671 00687 static function validateTagAttributes( $attribs, $element ) { 00688 return Sanitizer::validateAttributes( $attribs, 00689 Sanitizer::attributeWhitelist( $element ) ); 00690 } 00691 00707 static function validateAttributes( $attribs, $whitelist ) { 00708 global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5; 00709 00710 $whitelist = array_flip( $whitelist ); 00711 $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/'; 00712 00713 $out = array(); 00714 foreach( $attribs as $attribute => $value ) { 00715 #allow XML namespace declaration if RDFa is enabled 00716 if ( $wgAllowRdfaAttributes && preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $attribute ) ) { 00717 if ( !preg_match( self::EVIL_URI_PATTERN, $value ) ) { 00718 $out[$attribute] = $value; 00719 } 00720 00721 continue; 00722 } 00723 00724 # Allow any attribute beginning with "data-", if in HTML5 mode 00725 if ( !($wgHtml5 && preg_match( '/^data-/i', $attribute )) && !isset( $whitelist[$attribute] ) ) { 00726 continue; 00727 } 00728 00729 # Strip javascript "expression" from stylesheets. 
00730 # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp 00731 if( $attribute == 'style' ) { 00732 $value = Sanitizer::checkCss( $value ); 00733 } 00734 00735 if ( $attribute === 'id' ) { 00736 $value = Sanitizer::escapeId( $value, 'noninitial' ); 00737 } 00738 00739 //RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity 00740 if ( $attribute === 'rel' || $attribute === 'rev' || 00741 $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa 00742 $attribute === 'datatype' || $attribute === 'typeof' || #RDFa 00743 $attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' || #HTML5 microdata 00744 $attribute === 'itemscope' || $attribute === 'itemtype' ) { #HTML5 microdata 00745 00746 //Paranoia. Allow "simple" values but suppress javascript 00747 if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) { 00748 continue; 00749 } 00750 } 00751 00752 # NOTE: even though elements using href/src are not allowed directly, supply 00753 # validation code that can be used by tag hook handlers, etc 00754 if ( $attribute === 'href' || $attribute === 'src' ) { 00755 if ( !preg_match( $hrefExp, $value ) ) { 00756 continue; //drop any href or src attributes not using an allowed protocol. 00757 //NOTE: this also drops all relative URLs 00758 } 00759 } 00760 00761 // If this attribute was previously set, override it. 00762 // Output should only have one attribute of each name. 00763 $out[$attribute] = $value; 00764 } 00765 00766 if ( $wgAllowMicrodataAttributes ) { 00767 # itemtype, itemid, itemref don't make sense without itemscope 00768 if ( !array_key_exists( 'itemscope', $out ) ) { 00769 unset( $out['itemtype'] ); 00770 unset( $out['itemid'] ); 00771 unset( $out['itemref'] ); 00772 } 00773 # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref. 
00774 } 00775 return $out; 00776 } 00777 00788 static function mergeAttributes( $a, $b ) { 00789 $out = array_merge( $a, $b ); 00790 if( isset( $a['class'] ) && isset( $b['class'] ) 00791 && is_string( $a['class'] ) && is_string( $b['class'] ) 00792 && $a['class'] !== $b['class'] ) { 00793 $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}", 00794 -1, PREG_SPLIT_NO_EMPTY ); 00795 $out['class'] = implode( ' ', array_unique( $classes ) ); 00796 } 00797 return $out; 00798 } 00799 00817 static function checkCss( $value ) { 00818 // Decode character references like { 00819 $value = Sanitizer::decodeCharReferences( $value ); 00820 00821 // Decode escape sequences and line continuation 00822 // See the grammar in the CSS 2 spec, appendix D. 00823 // This has to be done AFTER decoding character references. 00824 // This means it isn't possible for this function to return 00825 // unsanitized escape sequences. It is possible to manufacture 00826 // input that contains character references that decode to 00827 // escape sequences that decode to character references, but 00828 // it's OK for the return value to contain character references 00829 // because the caller is supposed to escape those anyway. 00830 static $decodeRegex; 00831 if ( !$decodeRegex ) { 00832 $space = '[\\x20\\t\\r\\n\\f]'; 00833 $nl = '(?:\\n|\\r\\n|\\r|\\f)'; 00834 $backslash = '\\\\'; 00835 $decodeRegex = "/ $backslash 00836 (?: 00837 ($nl) | # 1. Line continuation 00838 ([0-9A-Fa-f]{1,6})$space? | # 2. character number 00839 (.) | # 3. backslash cancelling special meaning 00840 () | # 4. 
backslash at end of string 00841 )/xu"; 00842 } 00843 $value = preg_replace_callback( $decodeRegex, 00844 array( __CLASS__, 'cssDecodeCallback' ), $value ); 00845 00846 // Remove any comments; IE gets token splitting wrong 00847 // This must be done AFTER decoding character references and 00848 // escape sequences, because those steps can introduce comments 00849 // This step cannot introduce character references or escape 00850 // sequences, because it replaces comments with spaces rather 00851 // than removing them completely. 00852 $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value ); 00853 00854 // Remove anything after a comment-start token, to guard against 00855 // incorrect client implementations. 00856 $commentPos = strpos( $value, '/*' ); 00857 if ( $commentPos !== false ) { 00858 $value = substr( $value, 0, $commentPos ); 00859 } 00860 00861 // Reject problematic keywords and control characters 00862 if ( preg_match( '/[\000-\010\016-\037\177]/', $value ) ) { 00863 return '/* invalid control char */'; 00864 } elseif ( preg_match( '! expression | filter\s*: | accelerator\s*: | url\s*\( | image\s*\( | image-set\s*\( !ix', $value ) ) { 00865 return '/* insecure input */'; 00866 } 00867 return $value; 00868 } 00869 00874 static function cssDecodeCallback( $matches ) { 00875 if ( $matches[1] !== '' ) { 00876 // Line continuation 00877 return ''; 00878 } elseif ( $matches[2] !== '' ) { 00879 $char = codepointToUtf8( hexdec( $matches[2] ) ); 00880 } elseif ( $matches[3] !== '' ) { 00881 $char = $matches[3]; 00882 } else { 00883 $char = '\\'; 00884 } 00885 if ( $char == "\n" || $char == '"' || $char == "'" || $char == '\\' ) { 00886 // These characters need to be escaped in strings 00887 // Clean up the escape sequence to avoid parsing errors by clients 00888 return '\\' . dechex( ord( $char ) ) . 
' '; 00889 } else { 00890 // Decode unnecessary escape 00891 return $char; 00892 } 00893 } 00894 00914 static function fixTagAttributes( $text, $element ) { 00915 if( trim( $text ) == '' ) { 00916 return ''; 00917 } 00918 00919 $decoded = Sanitizer::decodeTagAttributes( $text ); 00920 $stripped = Sanitizer::validateTagAttributes( $decoded, $element ); 00921 00922 $attribs = array(); 00923 foreach( $stripped as $attribute => $value ) { 00924 $encAttribute = htmlspecialchars( $attribute ); 00925 $encValue = Sanitizer::safeEncodeAttribute( $value ); 00926 00927 $attribs[] = "$encAttribute=\"$encValue\""; 00928 } 00929 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : ''; 00930 } 00931 00937 static function encodeAttribute( $text ) { 00938 $encValue = htmlspecialchars( $text, ENT_QUOTES ); 00939 00940 // Whitespace is normalized during attribute decoding, 00941 // so if we've been passed non-spaces we must encode them 00942 // ahead of time or they won't be preserved. 00943 $encValue = strtr( $encValue, array( 00944 "\n" => ' ', 00945 "\r" => ' ', 00946 "\t" => '	', 00947 ) ); 00948 00949 return $encValue; 00950 } 00951 00958 static function safeEncodeAttribute( $text ) { 00959 $encValue = Sanitizer::encodeAttribute( $text ); 00960 00961 # Templates and links may be expanded in later parsing, 00962 # creating invalid or dangerous output. Suppress this. 00963 $encValue = strtr( $encValue, array( 00964 '<' => '<', // This should never happen, 00965 '>' => '>', // we've received invalid input 00966 '"' => '"', // which should have been escaped. 00967 '{' => '{', 00968 '[' => '[', 00969 "''" => '''', 00970 'ISBN' => 'ISBN', 00971 'RFC' => 'RFC', 00972 'PMID' => 'PMID', 00973 '|' => '|', 00974 '__' => '__', 00975 ) ); 00976 00977 # Stupid hack 00978 $encValue = preg_replace_callback( 00979 '/((?i)' . wfUrlProtocols() . 
')/', 00980 array( 'Sanitizer', 'armorLinksCallback' ), 00981 $encValue ); 00982 return $encValue; 00983 } 00984 01016 static function escapeId( $id, $options = array() ) { 01017 global $wgHtml5, $wgExperimentalHtmlIds; 01018 $options = (array)$options; 01019 01020 if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) { 01021 $id = Sanitizer::decodeCharReferences( $id ); 01022 $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id ); 01023 $id = trim( $id, '_' ); 01024 if ( $id === '' ) { 01025 # Must have been all whitespace to start with. 01026 return '_'; 01027 } else { 01028 return $id; 01029 } 01030 } 01031 01032 # HTML4-style escaping 01033 static $replace = array( 01034 '%3A' => ':', 01035 '%' => '.' 01036 ); 01037 01038 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) ); 01039 $id = str_replace( array_keys( $replace ), array_values( $replace ), $id ); 01040 01041 if ( !preg_match( '/^[a-zA-Z]/', $id ) 01042 && !in_array( 'noninitial', $options ) ) { 01043 // Initial character must be a letter! 01044 $id = "x$id"; 01045 } 01046 return $id; 01047 } 01048 01060 static function escapeClass( $class ) { 01061 // Convert ugly stuff to underscores and kill underscores in ugly places 01062 return rtrim(preg_replace( 01063 array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'), 01064 '_', 01065 $class ), '_'); 01066 } 01067 01075 static function escapeHtmlAllowEntities( $html ) { 01076 $html = Sanitizer::decodeCharReferences( $html ); 01077 # It seems wise to escape ' as well as ", as a matter of course. Can't 01078 # hurt. 
01079 $html = htmlspecialchars( $html, ENT_QUOTES ); 01080 return $html; 01081 } 01082 01088 private static function armorLinksCallback( $matches ) { 01089 return str_replace( ':', ':', $matches[1] ); 01090 } 01091 01100 public static function decodeTagAttributes( $text ) { 01101 if( trim( $text ) == '' ) { 01102 return array(); 01103 } 01104 01105 $attribs = array(); 01106 $pairs = array(); 01107 if( !preg_match_all( 01108 self::getAttribsRegex(), 01109 $text, 01110 $pairs, 01111 PREG_SET_ORDER ) ) { 01112 return $attribs; 01113 } 01114 01115 foreach( $pairs as $set ) { 01116 $attribute = strtolower( $set[1] ); 01117 $value = Sanitizer::getTagAttributeCallback( $set ); 01118 01119 // Normalize whitespace 01120 $value = preg_replace( '/[\t\r\n ]+/', ' ', $value ); 01121 $value = trim( $value ); 01122 01123 // Decode character references 01124 $attribs[$attribute] = Sanitizer::decodeCharReferences( $value ); 01125 } 01126 return $attribs; 01127 } 01128 01137 private static function getTagAttributeCallback( $set ) { 01138 if( isset( $set[6] ) ) { 01139 # Illegal #XXXXXX color with no quotes. 01140 return $set[6]; 01141 } elseif( isset( $set[5] ) ) { 01142 # No quotes. 01143 return $set[5]; 01144 } elseif( isset( $set[4] ) ) { 01145 # Single-quoted 01146 return $set[4]; 01147 } elseif( isset( $set[3] ) ) { 01148 # Double-quoted 01149 return $set[3]; 01150 } elseif( !isset( $set[2] ) ) { 01151 # In XHTML, attributes must have a value. 01152 # For 'reduced' form, return explicitly the attribute name here. 01153 return $set[1]; 01154 } else { 01155 throw new MWException( "Tag conditions not met. This should never happen and is a bug." 
); 01156 } 01157 } 01158 01170 private static function normalizeAttributeValue( $text ) { 01171 return str_replace( '"', '"', 01172 self::normalizeWhitespace( 01173 Sanitizer::normalizeCharReferences( $text ) ) ); 01174 } 01175 01180 private static function normalizeWhitespace( $text ) { 01181 return preg_replace( 01182 '/\r\n|[\x20\x0d\x0a\x09]/', 01183 ' ', 01184 $text ); 01185 } 01186 01195 static function normalizeSectionNameWhitespace( $section ) { 01196 return trim( preg_replace( '/[ _]+/', ' ', $section ) ); 01197 } 01198 01214 static function normalizeCharReferences( $text ) { 01215 return preg_replace_callback( 01216 self::CHAR_REFS_REGEX, 01217 array( 'Sanitizer', 'normalizeCharReferencesCallback' ), 01218 $text ); 01219 } 01224 static function normalizeCharReferencesCallback( $matches ) { 01225 $ret = null; 01226 if( $matches[1] != '' ) { 01227 $ret = Sanitizer::normalizeEntity( $matches[1] ); 01228 } elseif( $matches[2] != '' ) { 01229 $ret = Sanitizer::decCharReference( $matches[2] ); 01230 } elseif( $matches[3] != '' ) { 01231 $ret = Sanitizer::hexCharReference( $matches[3] ); 01232 } 01233 if( is_null( $ret ) ) { 01234 return htmlspecialchars( $matches[0] ); 01235 } else { 01236 return $ret; 01237 } 01238 } 01239 01250 static function normalizeEntity( $name ) { 01251 if ( isset( self::$htmlEntityAliases[$name] ) ) { 01252 return '&' . self::$htmlEntityAliases[$name] . ';'; 01253 } elseif ( in_array( $name, 01254 array( 'lt', 'gt', 'amp', 'quot' ) ) ) { 01255 return "&$name;"; 01256 } elseif ( isset( self::$htmlEntities[$name] ) ) { 01257 return '&#' . self::$htmlEntities[$name] . 
';';
		} else {
			return "&$name;";
		}
	}

	/**
	 * Build a decimal numeric character reference for a code point.
	 *
	 * @param $codepoint Int|String: code point in decimal; converted with intval()
	 * @return String|null "&#N;" when the code point passes validateCodepoint(),
	 *   null otherwise
	 */
	static function decCharReference( $codepoint ) {
		$point = intval( $codepoint );
		if ( self::validateCodepoint( $point ) ) {
			return sprintf( '&#%d;', $point );
		}
		return null;
	}

	/**
	 * Build a hexadecimal numeric character reference for a code point.
	 *
	 * @param $codepoint String: code point as hexadecimal digits; converted with hexdec()
	 * @return String|null "&#xN;" (lower-case hex digits) when the code point
	 *   passes validateCodepoint(), null otherwise
	 */
	static function hexCharReference( $codepoint ) {
		$point = hexdec( $codepoint );
		if ( self::validateCodepoint( $point ) ) {
			return sprintf( '&#x%x;', $point );
		}
		return null;
	}

	/**
	 * Tell whether a numeric code point is in the accepted set: tab, line feed,
	 * carriage return, 0x20-0xd7ff, 0xe000-0xfffd or 0x10000-0x10ffff.
	 * These ranges are the same as the "Char" production of XML 1.0, so
	 * surrogates, 0xfffe/0xffff and most C0 control characters are rejected.
	 *
	 * @param $codepoint Int|Float: numeric code point
	 * @return Boolean
	 */
	private static function validateCodepoint( $codepoint ) {
		return ( $codepoint == 0x09 )
			|| ( $codepoint == 0x0a )
			|| ( $codepoint == 0x0d )
			|| ( $codepoint >= 0x20 && $codepoint <= 0xd7ff )
			|| ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
			|| ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
	}

	/**
	 * Replace every character reference matched by CHAR_REFS_REGEX in the text
	 * with whatever decodeCharReferencesCallback() returns for it.
	 *
	 * @param $text String
	 * @return String
	 */
	public static function decodeCharReferences( $text ) {
		return preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
			$text );
	}

	/**
	 * Same replacement as decodeCharReferences(), but when at least one match
	 * was processed the result is additionally passed through
	 * $wgContLang->normalize(). Text without any match is returned untouched.
	 *
	 * @param $text String
	 * @return String
	 */
	public static function decodeCharReferencesAndNormalize( $text ) {
		global $wgContLang;

		$count = 0;
		$text = preg_replace_callback(
			self::CHAR_REFS_REGEX,
			array( 'Sanitizer', 'decodeCharReferencesCallback' ),
			$text, /* limit */ -1, $count );

		if ( !$count ) {
			return $text;
		}
		return $wgContLang->normalize( $text );
	}

	/**
	 * Callback for the CHAR_REFS_REGEX replacements above.
	 *
	 * Group 1 holds a named entity, group 2 a decimal reference and group 3 a
	 * hexadecimal reference; anything else is a lone ampersand.
	 *
	 * @param $matches Array: match groups from preg_replace_callback()
	 * @return String: replacement text
	 */
	static function decodeCharReferencesCallback( $matches ) {
		// Match groups are always strings, so strict comparison is safe here
		// and keeps "0" (as in "&#0;") from ever being treated as empty.
		if ( $matches[1] !== '' ) {
			return self::decodeEntity( $matches[1] );
		}
		if ( $matches[2] !== '' ) {
			return self::decodeChar( intval( $matches[2] ) );
		}
		if ( $matches[3] !== '' ) {
			return self::decodeChar( hexdec( $matches[3] ) );
		}
		# Last case should be an ampersand by itself
		return $matches[0];
	}

	/**
	 * Convert a numeric code point to its UTF-8 string via codepointToUtf8().
	 * Code points rejected by validateCodepoint() yield UTF8_REPLACEMENT.
	 *
	 * @param $codepoint Int|Float: numeric code point
	 * @return String
	 */
	static function decodeChar( $codepoint ) {
		if ( !self::validateCodepoint( $codepoint ) ) {
			return UTF8_REPLACEMENT;
		}
		return codepointToUtf8( $codepoint );
	}

	/**
	 * Decode a named entity. The name is first mapped through
	 * self::$htmlEntityAliases; if the resulting name is a key of
	 * self::$htmlEntities its UTF-8 character is returned, otherwise the
	 * entity is returned in "&name;" form.
	 *
	 * @param $name String: entity name without "&" and ";"
	 * @return String
	 */
	static function decodeEntity( $name ) {
		if ( isset( self::$htmlEntityAliases[$name] ) ) {
			$name = self::$htmlEntityAliases[$name];
		}
		if ( !isset( self::$htmlEntities[$name] ) ) {
			return "&$name;";
		}
		return codepointToUtf8( self::$htmlEntities[$name] );
	}

	/**
	 * Fetch the list of attributes allowed on a given element.
	 *
	 * @param $element String: element name as used for the whitelist keys
	 * @return Array: attribute names; empty when the element has no whitelist entry
	 */
	static function attributeWhitelist( $element ) {
		$list = self::setupAttributeWhitelist();
		return isset( $list[$element] )
			? $list[$element]
			: array();
	}

	/**
	 * Build (and cache) the map of element name => allowed attribute names.
	 *
	 * The result is kept in a function-static variable together with a key
	 * derived from $wgAllowRdfaAttributes, $wgHtml5 and
	 * $wgAllowMicrodataAttributes; it is rebuilt whenever that key changes.
	 *
	 * @return Array
	 */
	static function setupAttributeWhitelist() {
		global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;

		static $whitelist, $staticInitialised;
		$globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgHtml5', 'wgAllowMicrodataAttributes' ) );

		// Both sides are strings once initialised; compare strictly so the
		// cache key is never subject to PHP's loose string comparison rules.
		if ( isset( $whitelist ) && $staticInitialised === $globalContext ) {
			return $whitelist;
		}

		$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );

		if ( $wgAllowRdfaAttributes ) {
			# RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
			$common = array_merge( $common, array(
				'about', 'property', 'resource', 'datatype', 'typeof',
			) );
		}

		if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
			# Add HTML5 microdata attributes as specified by
			# http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
			$common = array_merge( $common, array(
				'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
			) );
		}

		$block = array_merge( $common, array( 'align' ) );
		$tablealign = array( 'align', 'char', 'charoff', 'valign' );
		$tablecell = array( 'abbr',
			'axis',
			'headers',
			'scope',
			'rowspan',
			'colspan',
			'nowrap', # deprecated
			'width',  # deprecated
			'height', # deprecated
			'bgcolor' # deprecated
		);

		# Numbers refer to sections in HTML 4.01 standard describing the element.
		# See: http://www.w3.org/TR/html4/
		$whitelist = array(
			# 7.5.4
			'div'        => $block,
			'center'     => $common, # deprecated
			'span'       => $block, # ??

			# 7.5.5
			'h1'         => $block,
			'h2'         => $block,
			'h3'         => $block,
			'h4'         => $block,
			'h5'         => $block,
			'h6'         => $block,

			# 7.5.6
			# address

			# 8.2.4
			# bdo

			# 9.2.1
			'em'         => $common,
			'strong'     => $common,
			'cite'       => $common,
			'dfn'        => $common,
			'code'       => $common,
			'samp'       => $common,
			'kbd'        => $common,
			'var'        => $common,
			'abbr'       => $common,
			# acronym

			# 9.2.2
			'blockquote' => array_merge( $common, array( 'cite' ) ),
			# q

			# 9.2.3
			'sub'        => $common,
			'sup'        => $common,

			# 9.3.1
			'p'          => $block,

			# 9.3.2
			'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

			# 9.3.4
			'pre'        => array_merge( $common, array( 'width' ) ),

			# 9.4
			'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
			'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),

			# 10.2
			'ul'         => array_merge( $common, array( 'type' ) ),
			'ol'         => array_merge( $common, array( 'type', 'start' ) ),
			'li'         => array_merge( $common, array( 'type', 'value' ) ),

			# 10.3
			'dl'         => $common,
			'dd'         => $common,
			'dt'         => $common,

			# 11.2.1
			'table'      => array_merge( $common,
								array( 'summary', 'width', 'border', 'frame',
										'rules', 'cellspacing', 'cellpadding',
										'align', 'bgcolor',
								) ),

			# 11.2.2
			'caption'    => array_merge( $common, array( 'align' ) ),

			# 11.2.3
			'thead'      => array_merge( $common, $tablealign ),
			'tfoot'      => array_merge( $common, $tablealign ),
			'tbody'      => array_merge( $common, $tablealign ),

			# 11.2.4
			'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
			'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),

			# 11.2.5
			'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

			# 11.2.6
			'td'         => array_merge( $common, $tablecell, $tablealign ),
			'th'         => array_merge( $common, $tablecell, $tablealign ),

			# 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object
			'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa

			# 13.2
			# Not usually allowed, but may be used for extension-style hooks
			# such as <math> when it is rasterized, or if $wgAllowImageTag is
			# true
			'img'        => array_merge( $common, array( 'alt', 'src', 'width', 'height' ) ),

			# 15.2.1
			'tt'         => $common,
			'b'          => $common,
			'i'          => $common,
			'big'        => $common,
			'small'      => $common,
			'strike'     => $common,
			's'          => $common,
			'u'          => $common,

			# 15.2.2
			'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
			# basefont

			# 15.3
			'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

			# XHTML Ruby annotation text module, simple ruby only.
			# http://www.w3.org/TR/ruby/
			'ruby'       => $common,
			# rbc
			# rtc
			'rb'         => $common,
			'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
			'rp'         => $common,

			# MathML root element, where used for extensions
			# 'title' may not be 100% valid here; it's XHTML
			# http://www.w3.org/TR/REC-MathML/
			'math'       => array( 'class', 'style', 'id', 'title' ),

			# HTML 5 section 4.6
			'bdi'        => $common,

		);

		if ( $wgHtml5 ) {
			# HTML5 elements, defined by:
			# http://www.whatwg.org/specs/web-apps/current-work/multipage/
			$whitelist += array(
				'data' => array_merge( $common, array( 'value' ) ),
				'time' => array_merge( $common, array( 'datetime' ) ),
				'mark' => $common,

				// meta and link are only permitted by removeHTMLtags when Microdata
				// is enabled so we don't bother adding a conditional to hide these
				// Also meta and link are only valid in WikiText as Microdata elements
				// (ie: validateTag rejects tags missing the attributes needed for Microdata)
				// So we don't bother including $common attributes that have no purpose.
				'meta' => array( 'itemprop', 'content' ),
				'link' => array( 'itemprop', 'href' ),
			);
		}

		$staticInitialised = $globalContext;

		return $whitelist;
	}

	/**
	 * Reduce text to plain text: remove everything between "<" and ">" via
	 * StringUtils::delimiterReplace(), decode character references, then
	 * normalize whitespace with self::normalizeWhitespace().
	 *
	 * @param $text String
	 * @return String
	 */
	static function stripAllTags( $text ) {
		# Actual <tags>
		$text = StringUtils::delimiterReplace( '<', '>', '', $text );

		# Normalize &entities and whitespace
		$text = self::decodeCharReferences( $text );
		$text = self::normalizeWhitespace( $text );

		return $text;
	}

	/**
	 * Produce a DOCTYPE declaration whose internal subset declares every
	 * entity listed in self::$htmlEntities as its numeric character reference.
	 *
	 * @return String
	 */
	static function hackDocType() {
		$out = "<!DOCTYPE html [\n";
		foreach ( self::$htmlEntities as $entity => $codepoint ) {
			$out .= "<!ENTITY $entity \"&#$codepoint;\">";
		}
		$out .= "]>\n";
		return $out;
	}

	/**
	 * Clean up a URL: decode character references, URL-encode a fixed set of
	 * unsafe characters, and strip characters that are ignored in IDNs from
	 * the host part.
	 *
	 * @param $url String
	 * @return String
	 */
	static function cleanUrl( $url ) {
		# Normalize any HTML entities in input. They will be
		# re-escaped by makeExternalLink().
		$url = self::decodeCharReferences( $url );

		# Escape any control characters introduced by the above step
		$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
			array( __CLASS__, 'cleanUrlCallback' ), $url );

		# Validate hostname portion
		$matches = array();
		if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
			return $url;
		}

		list( /* $whole */, $protocol, $host, $rest ) = $matches;

		// Characters that will be ignored in IDNs.
		// http://tools.ietf.org/html/rfc3454#section-3.1
		// Strip them before further processing so blacklists and such work.
		$strip = "/
			\\s|          # general whitespace
			\xc2\xad|     # 00ad SOFT HYPHEN
			\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
			\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
			\xe2\x81\xa0| # 2060 WORD JOINER
			\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
			\xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
			\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
			\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
			\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
			\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
			\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
			[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
			/xuD";

		// NOTE(review): under the "u" modifier preg_replace() returns null when
		// the subject is not valid UTF-8, which would silently drop the host
		// below. Confirm callers only pass valid UTF-8.
		$host = preg_replace( $strip, '', $host );

		// @todo FIXME: Validate hostnames here

		return $protocol . $host . $rest;
	}

	/**
	 * Callback for cleanUrl(): URL-encode the matched character.
	 *
	 * @param $matches Array: match groups from preg_replace_callback()
	 * @return String
	 */
	static function cleanUrlCallback( $matches ) {
		return urlencode( $matches[0] );
	}

	/**
	 * Check whether a string looks like an e-mail address.
	 *
	 * The 'isValidEmailAddr' hook is run first with the address and a result
	 * variable passed by reference; when wfRunHooks() returns a false value,
	 * that result is returned as-is. Otherwise the address must consist of
	 * one or more RFC 5322 "atext" characters or dots, an "@", and one or more
	 * dot-separated labels made of letters, digits and hyphens.
	 *
	 * The whole string has to match: the pattern uses the "D" modifier so that
	 * an address followed by a newline is rejected rather than accepted.
	 *
	 * @param $addr String: e-mail address to check
	 * @return Boolean|mixed: result of the pattern match, or the hook-supplied
	 *   result when the hook aborts
	 */
	public static function validateEmail( $addr ) {
		$result = null;
		if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
			return $result;
		}

		// Please note that the strings below are used inside brackets [], which
		// makes the hyphen "-" a range indicator. Hence it is double backslashed
		// below. See bug 26948
		$rfc5322_atext   = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~" ;
		$rfc1034_ldh_str = "a-z0-9\\-" ;

		$HTML5_email_regexp = "/
		^                      # start of string
		[$rfc5322_atext\\.]+    # user part which is liberal :p
		@                      # at sign separating user and domain
		[$rfc1034_ldh_str]+       # First domain part
		(\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
		$                      # End of string
		/ixD" ; // case Insensitive, eXtended, Dollar matches only at the very end

		return (bool) preg_match( $HTML5_email_regexp, $addr );
	}
}