MediaWiki  master
SearchEngine.php
Go to the documentation of this file.
00001 <?php
/**
 * Base search engine.
 *
 * The query entry points in this class are stubs: searchText() and
 * searchTitle() return null, update() and updateTitle() do nothing.
 * NOTE(review): presumably concrete back ends subclass this and override
 * them -- create() instantiates whatever class name is configured.
 *
 * The class also hosts the static "near match" title lookup and a number of
 * namespace-related helpers.
 */
class SearchEngine {
	/** @var int Maximum number of results; written by setLimitOffset() */
	public $limit = 10;

	/** @var int Result offset; written by setLimitOffset() */
	public $offset = 0;

	/** @var string Not read by any code visible in this class */
	public $prefix = '';

	/** @var array Not read by any code visible in this class */
	public $searchTerms = array();

	/**
	 * @var array|null Namespace indexes to search. replacePrefixes() sets
	 *  this to null when the query starts with the "search all" keyword.
	 */
	public $namespaces = array( NS_MAIN );

	/** @var bool Not read by any code visible in this class */
	public $showRedirects = false;

	/** @var array Feature name => data, filled by setFeatureData() */
	protected $features = array();

	/** Database handle selected by the constructor */
	protected $db;

	/**
	 * @param mixed $db Database handle to use; when falsy, the handle
	 *  returned by wfGetDB( DB_SLAVE ) is used instead.
	 */
	public function __construct( $db = null ) {
		$this->db = $db ? $db : wfGetDB( DB_SLAVE );
	}

	/**
	 * Full-text search stub.
	 *
	 * @param string $term Search term
	 * @return null The base implementation never returns results
	 */
	public function searchText( $term ) {
		return null;
	}

	/**
	 * Title search stub.
	 *
	 * @param string $term Search term
	 * @return null The base implementation never returns results
	 */
	public function searchTitle( $term ) {
		return null;
	}

	/**
	 * Tell whether the 'list-redirects' feature is supported.
	 *
	 * @deprecated since 1.18 Call supports( 'list-redirects' ) instead
	 * @return bool
	 */
	public function acceptListRedirects() {
		wfDeprecated( __METHOD__, '1.18' );
		return $this->supports( 'list-redirects' );
	}

	/**
	 * Tell whether a named feature is supported by this engine.
	 *
	 * The base engine answers true for 'list-redirects' only; every other
	 * name, including 'title-suffix-filter', yields false.
	 *
	 * @param string $feature Feature name
	 * @return bool
	 */
	public function supports( $feature ) {
		switch( $feature ) {
		case 'list-redirects':
			return true;
		case 'title-suffix-filter':
		default:
			return false;
		}
	}

	/**
	 * Store data for a feature under its name.
	 *
	 * @param string $feature Feature name, used as the array key
	 * @param mixed $data Value to remember; replaces any previous value
	 */
	public function setFeatureData( $feature, $data ) {
		$this->features[$feature] = $data;
	}

	/**
	 * Normalize text by passing it through the content language's
	 * segmentByWord().
	 *
	 * @param string $string
	 * @return string
	 */
	public function normalizeText( $string ) {
		global $wgContLang;

		// Word segmentation is needed for languages such as Chinese
		$segmented = $wgContLang->segmentByWord( $string );
		return $segmented;
	}

	/**
	 * Transform a search term. The base implementation hands it back as is.
	 *
	 * @param string $term
	 * @return string
	 */
	public function transformSearchTerm( $term ) {
		return $term;
	}

	/**
	 * Find a title closely matching the search term, then let the
	 * 'SearchGetNearMatchComplete' hook inspect or replace the outcome.
	 *
	 * @param string $searchterm
	 * @return Title|null
	 */
	public static function getNearMatch( $searchterm ) {
		$match = self::getNearMatchInternal( $searchterm );

		// $match is handed over by reference so handlers can swap it out
		wfRunHooks( 'SearchGetNearMatchComplete', array( $searchterm, &$match ) );
		return $match;
	}

	/**
	 * Same lookup as getNearMatch(), wrapped in a result set object.
	 *
	 * @param string $searchterm
	 * @return SearchNearMatchResultSet
	 */
	public static function getNearMatchResultSet( $searchterm ) {
		$match = self::getNearMatch( $searchterm );
		return new SearchNearMatchResultSet( $match );
	}

	/**
	 * Worker behind getNearMatch().
	 *
	 * Tries the term and, when the content language has variants, each
	 * converted variant: first as typed, then in several letter-case
	 * forms, then via the 'SearchGetNearMatch' hook. If nothing is found,
	 * falls back to special handling based on the namespace of the
	 * original term, and finally retries a fully quoted term without its
	 * quotes.
	 *
	 * @param string $searchterm
	 * @return Title|null
	 */
	private static function getNearMatchInternal( $searchterm ) {
		global $wgContLang, $wgEnableSearchContributorsByIP;

		$candidates = array( $searchterm );
		if ( $wgContLang->hasVariants() ) {
			$candidates = array_merge(
				$candidates,
				$wgContLang->autoConvertToAllVariants( $searchterm )
			);
		}

		// A handler returning false short-circuits the lookup with its own answer
		$titleResult = null;
		if ( !wfRunHooks( 'SearchGetNearMatchBefore', array( $candidates, &$titleResult ) ) ) {
			return $titleResult;
		}

		// Language methods applied in turn: all lower case (i.e. first letter
		// capitalized), capitalized words, all upper case, and
		// Word-Caps-Breaking-At-Word-Breaks for hyphenated names etc.
		$caseMethods = array( 'lc', 'ucwords', 'uc', 'ucwordbreaks' );

		foreach ( $candidates as $term ) {
			// Exact match? No need to look further.
			$title = Title::newFromText( $term );
			if ( is_null( $title ) ) {
				return null;
			}
			if ( $title->isSpecialPage() || $title->isExternal() || $title->exists() ) {
				return $title;
			}

			// See if it otherwise has content in some sane sense
			if ( WikiPage::factory( $title )->hasViewableContent() ) {
				return $title;
			}

			foreach ( $caseMethods as $method ) {
				$title = Title::newFromText( $wgContLang->$method( $term ) );
				if ( $title && $title->exists() ) {
					return $title;
				}
			}

			// Give hooks a chance at better match variants
			$title = null;
			if ( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
				return $title;
			}
		}

		// The first candidate is $searchterm itself and the loop returns
		// as soon as a term cannot be parsed, so this title was already
		// parsed successfully above.
		$title = Title::newFromText( $searchterm );

		// Entering an IP address goes to the contributions page
		if ( $wgEnableSearchContributorsByIP ) {
			$isIpUserPage = $title->getNamespace() == NS_USER && User::isIP( $title->getText() );
			if ( $isIpUserPage || User::isIP( trim( $searchterm ) ) ) {
				return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
			}
		}

		switch ( $title->getNamespace() ) {
		case NS_USER:
			// Entering a user goes to the user page whether it's there or not
			return $title;
		case NS_FILE:
			// Go to images that exist even if there's no local page.
			// There may have been a funny upload, or it may be on a shared
			// file repository such as Wikimedia Commons.
			if ( wfFindFile( $title ) ) {
				return $title;
			}
			break;
		case NS_MEDIAWIKI:
			// Page may be "implied" if not customized. Just return it,
			// with caps forced as the message system likes it.
			return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
		}

		// Quoted term? Try without the quotes...
		$quoted = array();
		if ( preg_match( '/^"([^"]+)"$/', $searchterm, $quoted ) ) {
			return self::getNearMatch( $quoted[1] );
		}

		return null;
	}

	/**
	 * Characters accepted in a search term, written as the inside of a
	 * PCRE character class (see filter()).
	 *
	 * @return string
	 */
	public static function legalSearchChars() {
		return "A-Za-z_'.0-9\\x80-\\xFF\\-";
	}

	/**
	 * Set the result limit and offset. Both values are coerced with intval().
	 *
	 * @param int $limit
	 * @param int $offset
	 */
	public function setLimitOffset( $limit, $offset = 0 ) {
		$this->offset = intval( $offset );
		$this->limit = intval( $limit );
	}

	/**
	 * Replace the list of namespaces to search.
	 *
	 * @param array|null $namespaces Stored without validation
	 */
	public function setNamespaces( $namespaces ) {
		$this->namespaces = $namespaces;
	}

	/**
	 * Strip a leading "all:" keyword or namespace prefix from the query and
	 * adjust $this->namespaces to match.
	 *
	 * The "all" keyword is the 'searchall' message in the content language
	 * followed by a colon. A prefix that is not a known namespace leaves
	 * both the query and the namespaces alone. If stripping the prefix
	 * would leave nothing but whitespace, the original query is returned
	 * (the namespace change is kept). The
	 * 'SearchEngineReplacePrefixesComplete' hook always runs last.
	 *
	 * @param string $query
	 * @return string The query without its prefix
	 */
	public function replacePrefixes( $query ) {
		global $wgContLang;

		$parsed = $query;
		$colonPos = strpos( $query, ':' );

		// Without a colon there is no prefix to examine
		if ( $colonPos !== false ) {
			$allkeyword = wfMessage( 'searchall' )->inContentLanguage()->text() . ":";
			$keywordLen = strlen( $allkeyword );

			if ( strncmp( $query, $allkeyword, $keywordLen ) === 0 ) {
				$this->namespaces = null;
				$parsed = substr( $query, $keywordLen );
			} else {
				$prefix = substr( $query, 0, $colonPos );
				$index = $wgContLang->getNsIndex( $prefix );
				if ( $index !== false ) {
					$this->namespaces = array( $index );
					$parsed = substr( $query, strlen( $prefix ) + 1 );
				}
			}

			if ( trim( $parsed ) === '' ) {
				// prefix was the whole query
				$parsed = $query;
			}
		}

		wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );

		return $parsed;
	}

	/**
	 * List the namespaces of the content language whose index is at least
	 * NS_MAIN, after the 'SearchableNamespaces' hook had a chance to edit
	 * the list.
	 *
	 * @return array Namespace index => name
	 */
	public static function searchableNamespaces() {
		global $wgContLang;

		$searchable = array();
		foreach ( $wgContLang->getNamespaces() as $index => $name ) {
			if ( $index < NS_MAIN ) {
				continue;
			}
			$searchable[$index] = $name;
		}

		wfRunHooks( 'SearchableNamespaces', array( &$searchable ) );
		return $searchable;
	}

	/**
	 * Work out which namespaces a user wants to search, from preferences.
	 *
	 * @param User $user
	 * @return array Namespace indexes
	 */
	public static function userNamespaces( $user ) {
		global $wgSearchEverythingOnlyLoggedIn;

		$searchable = self::searchableNamespaces();

		// The "search everything" preference overrides the per-namespace
		// ones; it can be restricted to logged-in users.
		$mayUseEverything = !$wgSearchEverythingOnlyLoggedIn || $user->isLoggedIn();
		if ( $mayUseEverything && $user->getOption( 'searcheverything' ) ) {
			return array_keys( $searchable );
		}

		$selected = array();
		foreach ( $searchable as $index => $name ) {
			if ( $user->getOption( 'searchNs' . $index ) ) {
				$selected[] = $index;
			}
		}

		return $selected;
	}

	/**
	 * Highlighting settings. These are hard-coded and do not depend on any
	 * user.
	 *
	 * @return array Two elements: context lines (2), context chars (75)
	 */
	public static function userHighlightPrefs() {
		return array( 2, 75 );
	}

	/**
	 * Namespaces searched by default: the keys of
	 * $wgNamespacesToBeSearchedDefault whose value compares equal to true.
	 *
	 * @return array
	 */
	public static function defaultNamespaces() {
		global $wgNamespacesToBeSearchedDefault;

		return array_keys( $wgNamespacesToBeSearchedDefault, true );
	}

	/**
	 * Turn namespace indexes into display names, substituting the
	 * 'blanknamespace' message wherever the formatted name is empty.
	 *
	 * @param array $namespaces Namespace indexes
	 * @return array Names, under the same keys as the input
	 */
	public static function namespacesAsText( $namespaces ) {
		global $wgContLang;

		$names = array_map( array( $wgContLang, 'getFormattedNsText' ), $namespaces );
		foreach ( $names as $key => $text ) {
			if ( empty( $text ) ) {
				$names[$key] = wfMessage( 'blanknamespace' )->text();
			}
		}
		return $names;
	}

	/**
	 * Namespaces flagged in $wgNamespacesToBeSearchedHelp: the keys whose
	 * value compares equal to true.
	 *
	 * @return array
	 */
	public static function helpNamespaces() {
		global $wgNamespacesToBeSearchedHelp;

		return array_keys( $wgNamespacesToBeSearchedHelp, true );
	}

	/**
	 * Replace every character outside legalSearchChars() with a space and
	 * trim the result.
	 *
	 * @param string $text
	 * @return string
	 */
	public function filter( $text ) {
		$lc = $this->legalSearchChars();
		$cleaned = preg_replace( "/[^{$lc}]/", " ", $text );
		return trim( $cleaned );
	}

	/**
	 * Instantiate the configured search engine.
	 *
	 * Uses the class named by $wgSearchType when set; otherwise asks the
	 * slave database connection for its search engine class and passes that
	 * connection to the constructor. The new object's limit and offset are
	 * both reset to 0.
	 *
	 * @return SearchEngine
	 */
	public static function create() {
		global $wgSearchType;

		$dbr = null;
		$class = $wgSearchType;
		if ( !$class ) {
			$dbr = wfGetDB( DB_SLAVE );
			$class = $dbr->getSearchEngine();
		}

		$engine = new $class( $dbr );
		$engine->setLimitOffset( 0, 0 );
		return $engine;
	}

	/**
	 * Index update stub; does nothing in the base class.
	 *
	 * @param int $id
	 * @param string $title
	 * @param string $text
	 */
	public function update( $id, $title, $text ) {
		// no-op
	}

	/**
	 * Title index update stub; does nothing in the base class.
	 *
	 * @param int $id
	 * @param string $title
	 */
	public function updateTitle( $id, $title ) {
		// no-op
	}

	/**
	 * Get the OpenSearch URL template: $wgOpenSearchTemplate when set,
	 * otherwise an api.php opensearch URL on $wgCanonicalServer limited to
	 * the default namespaces (or namespace 0 if that list is empty).
	 *
	 * @return string
	 */
	public static function getOpenSearchTemplate() {
		global $wgOpenSearchTemplate, $wgCanonicalServer;

		if ( $wgOpenSearchTemplate ) {
			return $wgOpenSearchTemplate;
		}

		$ns = implode( '|', self::defaultNamespaces() );
		if ( !$ns ) {
			$ns = "0";
		}
		return $wgCanonicalServer . wfScript( 'api' ) . '?action=opensearch&search={searchTerms}&namespace=' . $ns;
	}
}
00509 
/**
 * Base result set. Every accessor reports "nothing here": no rows, no
 * suggestion, no interwiki results.
 * NOTE(review): presumably meant to be subclassed by real result sets.
 */
class SearchResultSet {
	/**
	 * @return array Always empty in the base class
	 */
	public function termMatches() {
		return array();
	}

	/**
	 * @return int Always 0 in the base class
	 */
	public function numRows() {
		return 0;
	}

	/**
	 * @return bool Always false in the base class
	 */
	public function hasResults() {
		return false;
	}

	/**
	 * @return null Always null in the base class
	 */
	public function getTotalHits() {
		return null;
	}

	/**
	 * @return bool Always false in the base class
	 */
	public function hasSuggestion() {
		return false;
	}

	/**
	 * @return null Always null in the base class
	 */
	public function getSuggestionQuery() {
		return null;
	}

	/**
	 * @return string Always the empty string in the base class
	 */
	public function getSuggestionSnippet() {
		return '';
	}

	/**
	 * @return null Always null in the base class
	 */
	public function getInfo() {
		return null;
	}

	/**
	 * @return null Always null in the base class
	 */
	public function getInterwikiResults() {
		return null;
	}

	/**
	 * Tell whether getInterwikiResults() yields anything that does not
	 * loosely compare equal to null.
	 *
	 * @return bool
	 */
	public function hasInterwikiResults() {
		$interwiki = $this->getInterwikiResults();
		return $interwiki != null;
	}

	/**
	 * Fetch the next result.
	 *
	 * @return bool Always false in the base class
	 */
	public function next() {
		return false;
	}

	/**
	 * Release resources. Nothing to release in the base class.
	 */
	public function free() {
		// ...
	}
}
00622 
/**
 * Result set backed by a database result object.
 *
 * The wrapped result may be the boolean false, in which case numRows(),
 * next() and free() all return false without touching it.
 */
class SqlSearchResultSet extends SearchResultSet {

	/** Database result object, or false when there is none */
	protected $mResultSet;

	/**
	 * Search terms, returned verbatim by termMatches().
	 *
	 * Declared explicitly so the constructor no longer creates a dynamic
	 * property (deprecated as of PHP 8.2). Kept public because the
	 * dynamic property it replaces was publicly accessible.
	 */
	public $mTerms;

	/**
	 * @param mixed $resultSet Database result object, or false
	 * @param mixed $terms Value handed back by termMatches()
	 */
	public function __construct( $resultSet, $terms ) {
		$this->mResultSet = $resultSet;
		$this->mTerms = $terms;
	}

	/**
	 * @return mixed The terms given to the constructor
	 */
	public function termMatches() {
		return $this->mTerms;
	}

	/**
	 * @return int|bool Row count of the wrapped result, or false when
	 *  there is no result
	 */
	public function numRows() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		return $this->mResultSet->numRows();
	}

	/**
	 * Fetch the next row and wrap it in a SearchResult.
	 *
	 * @return SearchResult|bool False when there is no result or no row left
	 */
	public function next() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		$row = $this->mResultSet->fetchObject();
		if ( $row === false ) {
			return false;
		}

		return SearchResult::newFromRow( $row );
	}

	/**
	 * Free the wrapped result.
	 *
	 * @return bool|null False when there is no result, nothing otherwise
	 */
	public function free() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		$this->mResultSet->free();
	}
}
00664 
/**
 * Empty marker class.
 *
 * Some search engines may bail out if too many matches are found.
 */
class SearchResultTooMany {
}
00671 
00672 
/**
 * A single search hit: a title plus, when available, the revision and file
 * that go with it.
 *
 * Most snippet and metadata accessors are stubs returning '' or null.
 * NOTE(review): presumably subclasses override them.
 */
class SearchResult {

	/** @var Revision|null Revision loaded by initFromTitle() */
	public $mRevision = null;

	/** File found by wfFindFile() for NS_FILE titles, else null */
	public $mImage = null;

	/** @var Title|null */
	public $mTitle;

	/** @var string|null Text for snippets and counts; filled lazily by initText() */
	public $mText;

	/**
	 * Build a result from a title.
	 *
	 * @param Title|null $title
	 * @return SearchResult
	 */
	public static function newFromTitle( $title ) {
		$result = new self();
		$result->initFromTitle( $title );
		return $result;
	}

	/**
	 * Build a result from a database row.
	 *
	 * @param object $row Needs page_namespace and page_title members
	 * @return SearchResult
	 */
	public static function newFromRow( $row ) {
		$result = new self();
		$result->initFromRow( $row );
		return $result;
	}

	/**
	 * @param object|null $row When given, the result is initialised from it
	 */
	public function __construct( $row = null ) {
		if ( !is_null( $row ) ) {
			// Backwards compatibility with pre-1.17 callers
			$this->initFromRow( $row );
		}
	}

	/**
	 * Initialise from a row by building a title out of its
	 * page_namespace and page_title members.
	 *
	 * @param object $row
	 */
	protected function initFromRow( $row ) {
		$this->initFromTitle( Title::makeTitle( $row->page_namespace, $row->page_title ) );
	}

	/**
	 * Store the title and, unless it is null, load its revision and (for
	 * the file namespace) its file.
	 *
	 * The 'SearchResultInitFromTitle' hook may set the revision id to
	 * load; it starts out as false.
	 *
	 * @param Title|null $title
	 */
	protected function initFromTitle( $title ) {
		$this->mTitle = $title;
		if ( is_null( $this->mTitle ) ) {
			return;
		}

		$id = false;
		wfRunHooks( 'SearchResultInitFromTitle', array( $title, &$id ) );
		$this->mRevision = Revision::newFromTitle(
			$this->mTitle, $id, Revision::READ_NORMAL );
		if ( $this->mTitle->getNamespace() === NS_FILE ) {
			$this->mImage = wfFindFile( $this->mTitle );
		}
	}

	/**
	 * @return bool True when no title is attached to this result
	 */
	public function isBrokenTitle() {
		return is_null( $this->mTitle );
	}

	/**
	 * @return bool True when neither a revision nor a file is available
	 */
	public function isMissingRevision() {
		return !$this->mRevision && !$this->mImage;
	}

	/**
	 * @return Title|null
	 */
	public function getTitle() {
		return $this->mTitle;
	}

	/**
	 * @return null The base class has no score
	 */
	public function getScore() {
		return null;
	}

	/**
	 * Lazily fill $this->mText from the revision's content.
	 *
	 * The text is '' when there is no revision, or when the revision
	 * hands back no content object. The original code dereferenced that
	 * value unconditionally, which is a fatal error if it is null.
	 */
	protected function initText() {
		if ( isset( $this->mText ) ) {
			return;
		}

		if ( $this->mRevision != null ) {
			//TODO: if we could plug in some code that knows about special content models *and* about
			//      special features of the search engine, the search could benefit.
			$content = $this->mRevision->getContent();
			$this->mText = $content ? $content->getTextForSearchIndex() : '';
		} else { // TODO: can we fetch raw wikitext for commons images?
			$this->mText = '';
		}
	}

	/**
	 * Get a highlighted snippet of the result text.
	 *
	 * Uses SearchHighlighter::highlightText() when
	 * $wgAdvancedSearchHighlighting is truthy and highlightSimple()
	 * otherwise.
	 *
	 * @param array $terms Terms to highlight
	 * @return string
	 */
	public function getTextSnippet( $terms ) {
		global $wgAdvancedSearchHighlighting;
		$this->initText();

		// TODO: make highliter take a content object. Make ContentHandler a factory for SearchHighliter.
		// userHighlightPrefs() takes no parameters, so nothing is passed.
		list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs();
		$h = new SearchHighlighter();
		if ( $wgAdvancedSearchHighlighting ) {
			return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
		}
		return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
	}

	/**
	 * @param array $terms
	 * @return string Always '' in the base class
	 */
	public function getTitleSnippet( $terms ) {
		return '';
	}

	/**
	 * @param array $terms
	 * @return string Always '' in the base class
	 */
	public function getRedirectSnippet( $terms ) {
		return '';
	}

	/**
	 * @return null Always null in the base class
	 */
	public function getRedirectTitle() {
		return null;
	}

	/**
	 * @return string Always '' in the base class
	 */
	public function getSectionSnippet() {
		return '';
	}

	/**
	 * @return null Always null in the base class
	 */
	public function getSectionTitle() {
		return null;
	}

	/**
	 * @return string Timestamp of the revision if there is one, else of
	 *  the file, else ''
	 */
	public function getTimestamp() {
		if ( $this->mRevision ) {
			return $this->mRevision->getTimestamp();
		}
		if ( $this->mImage ) {
			return $this->mImage->getTimestamp();
		}
		return '';
	}

	/**
	 * @return int Word count of the result text, per str_word_count()
	 */
	public function getWordCount() {
		$this->initText();
		return str_word_count( $this->mText );
	}

	/**
	 * @return int Length of the result text in bytes
	 */
	public function getByteSize() {
		$this->initText();
		return strlen( $this->mText );
	}

	/**
	 * @return bool Always false in the base class
	 */
	public function hasRelated() {
		return false;
	}

	/**
	 * @return string Always '' in the base class
	 */
	public function getInterwikiPrefix() {
		return '';
	}
}
/**
 * Result set holding at most one result: the near-match title handed to
 * the constructor.
 */
class SearchNearMatchResultSet extends SearchResultSet {
	/** @var bool Whether next() has already handed out the single result */
	private $fetched = false;

	/**
	 * The match given to the constructor; falsy when there is none.
	 *
	 * Declared explicitly so the constructor no longer creates a dynamic
	 * property (deprecated as of PHP 8.2). Kept public because the
	 * dynamic property it replaces was publicly accessible.
	 */
	public $result;

	/**
	 * @param Title|null $match Title to wrap; a falsy value means "no result"
	 */
	public function __construct( $match ) {
		$this->result = $match;
	}

	/**
	 * Tell whether a match is present.
	 *
	 * This overrides the inherited hasResults(), which always answers
	 * false. numRows() calls hasResults(), so without this override the
	 * set reported zero rows even when it held a match.
	 *
	 * @return bool
	 */
	public function hasResults() {
		return (bool)$this->result;
	}

	/**
	 * Alias of hasResults(), kept for callers using the original name.
	 *
	 * @return bool
	 */
	public function hasResult() {
		return $this->hasResults();
	}

	/**
	 * @return int 1 when a match is present, otherwise 0
	 */
	public function numRows() {
		return $this->hasResults() ? 1 : 0;
	}

	/**
	 * Hand out the match once, wrapped in a SearchResult.
	 *
	 * @return SearchResult|bool False when there is no match or it was
	 *  already fetched
	 */
	public function next() {
		if ( $this->fetched || !$this->result ) {
			return false;
		}
		$this->fetched = true;
		return SearchResult::newFromTitle( $this->result );
	}
}
00925 
00931 class SearchHighlighter {
00932         var $mCleanWikitext = true;
00933 
00934         function __construct( $cleanupWikitext = true ) {
00935                 $this->mCleanWikitext = $cleanupWikitext;
00936         }
00937 
00947         public function highlightText( $text, $terms, $contextlines, $contextchars ) {
00948                 global $wgContLang;
00949                 global $wgSearchHighlightBoundaries;
00950                 $fname = __METHOD__;
00951 
00952                 if ( $text == '' )
00953                         return '';
00954 
00955                 // spli text into text + templates/links/tables
00956                 $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
00957                 // first capture group is for detecting nested templates/links/tables/references
00958                 $endPatterns = array(
00959                         1 => '/(\{\{)|(\}\})/', // template
00960                         2 => '/(\[\[)|(\]\])/', // image
00961                         3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table
00962 
00963                 // @todo FIXME: This should prolly be a hook or something
00964                 if ( function_exists( 'wfCite' ) ) {
00965                         $spat .= '|(<ref>)'; // references via cite extension
00966                         $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
00967                 }
00968                 $spat .= '/';
00969                 $textExt = array(); // text extracts
00970                 $otherExt = array();  // other extracts
00971                 wfProfileIn( "$fname-split" );
00972                 $start = 0;
00973                 $textLen = strlen( $text );
00974                 $count = 0; // sequence number to maintain ordering
00975                 while ( $start < $textLen ) {
00976                         // find start of template/image/table
00977                         if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
00978                                 $epat = '';
00979                                 foreach ( $matches as $key => $val ) {
00980                                         if ( $key > 0 && $val[1] != - 1 ) {
00981                                                 if ( $key == 2 ) {
00982                                                         // see if this is an image link
00983                                                         $ns = substr( $val[0], 2, - 1 );
00984                                                         if ( $wgContLang->getNsIndex( $ns ) != NS_FILE )
00985                                                                 break;
00986 
00987                                                 }
00988                                                 $epat = $endPatterns[$key];
00989                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
00990                                                 $start = $val[1];
00991                                                 break;
00992                                         }
00993                                 }
00994                                 if ( $epat ) {
00995                                         // find end (and detect any nested elements)
00996                                         $level = 0;
00997                                         $offset = $start + 1;
00998                                         $found = false;
00999                                         while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
01000                                                 if ( array_key_exists( 2, $endMatches ) ) {
01001                                                         // found end
01002                                                         if ( $level == 0 ) {
01003                                                                 $len = strlen( $endMatches[2][0] );
01004                                                                 $off = $endMatches[2][1];
01005                                                                 $this->splitAndAdd( $otherExt, $count,
01006                                                                         substr( $text, $start, $off + $len  - $start ) );
01007                                                                 $start = $off + $len;
01008                                                                 $found = true;
01009                                                                 break;
01010                                                         } else {
01011                                                                 // end of nested element
01012                                                                 $level -= 1;
01013                                                         }
01014                                                 } else {
01015                                                         // nested
01016                                                         $level += 1;
01017                                                 }
01018                                                 $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
01019                                         }
01020                                         if ( ! $found ) {
01021                                                 // couldn't find appropriate closing tag, skip
01022                                                 $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
01023                                                 $start += strlen( $matches[0][0] );
01024                                         }
01025                                         continue;
01026                                 }
01027                         }
01028                         // else: add as text extract
01029                         $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
01030                         break;
01031                 }
01032 
01033                 $all = $textExt + $otherExt; // these have disjunct key sets
01034 
01035                 wfProfileOut( "$fname-split" );
01036 
01037                 // prepare regexps
01038                 foreach ( $terms as $index => $term ) {
01039                         // manually do upper/lowercase stuff for utf-8 since PHP won't do it
01040                         if ( preg_match( '/[\x80-\xff]/', $term ) ) {
01041                                 $terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
01042                         } else {
01043                                 $terms[$index] = $term;
01044                         }
01045                 }
01046                 $anyterm = implode( '|', $terms );
01047                 $phrase = implode( "$wgSearchHighlightBoundaries+", $terms );
01048 
01049                 // @todo FIXME: A hack to scale contextchars, a correct solution
01050                 // would be to have contextchars actually be char and not byte
01051                 // length, and do proper utf-8 substrings and lengths everywhere,
01052                 // but PHP is making that very hard and unclean to implement :(
01053                 $scale = strlen( $anyterm ) / mb_strlen( $anyterm );
01054                 $contextchars = intval( $contextchars * $scale );
01055 
01056                 $patPre = "(^|$wgSearchHighlightBoundaries)";
01057                 $patPost = "($wgSearchHighlightBoundaries|$)";
01058 
01059                 $pat1 = "/(" . $phrase . ")/ui";
01060                 $pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";
01061 
01062                 wfProfileIn( "$fname-extract" );
01063 
01064                 $left = $contextlines;
01065 
01066                 $snippets = array();
01067                 $offsets = array();
01068 
01069                 // show beginning only if it contains all words
01070                 $first = 0;
01071                 $firstText = '';
01072                 foreach ( $textExt as $index => $line ) {
01073                         if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
01074                                 $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
01075                                 $first = $index;
01076                                 break;
01077                         }
01078                 }
01079                 if ( $firstText ) {
01080                         $succ = true;
01081                         // check if first text contains all terms
01082                         foreach ( $terms as $term ) {
01083                                 if ( ! preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
01084                                         $succ = false;
01085                                         break;
01086                                 }
01087                         }
01088                         if ( $succ ) {
01089                                 $snippets[$first] = $firstText;
01090                                 $offsets[$first] = 0;
01091                         }
01092                 }
01093                 if ( ! $snippets ) {
01094                         // match whole query on text
01095                         $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
01096                         // match whole query on templates/tables/images
01097                         $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
01098                         // match any words on text
01099                         $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
01100                         // match any words on templates/tables/images
01101                         $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );
01102 
01103                         ksort( $snippets );
01104                 }
01105 
01106                 // add extra chars to each snippet to make snippets constant size
01107                 $extended = array();
01108                 if ( count( $snippets ) == 0 ) {
01109                         // couldn't find the target words, just show beginning of article
01110                         if ( array_key_exists( $first, $all ) ) {
01111                                 $targetchars = $contextchars * $contextlines;
01112                                 $snippets[$first] = '';
01113                                 $offsets[$first] = 0;
01114                         }
01115                 } else {
01116                         // if begin of the article contains the whole phrase, show only that !!
01117                         if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
01118                                 && $offsets[$first] < $contextchars * 2 ) {
01119                                 $snippets = array ( $first => $snippets[$first] );
01120                         }
01121 
01122                         // calc by how much to extend existing snippets
01123                         $targetchars = intval( ( $contextchars * $contextlines ) / count ( $snippets ) );
01124                 }
01125 
01126                 foreach ( $snippets as $index => $line ) {
01127                         $extended[$index] = $line;
01128                         $len = strlen( $line );
01129                         if ( $len < $targetchars - 20 ) {
01130                                 // complete this line
01131                                 if ( $len < strlen( $all[$index] ) ) {
01132                                         $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index] + $targetchars, $offsets[$index] );
01133                                         $len = strlen( $extended[$index] );
01134                                 }
01135 
01136                                 // add more lines
01137                                 $add = $index + 1;
01138                                 while ( $len < $targetchars - 20
01139                                            && array_key_exists( $add, $all )
01140                                            && !array_key_exists( $add, $snippets ) ) {
01141                                         $offsets[$add] = 0;
01142                                         $tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
01143                                         $extended[$add] = $tt;
01144                                         $len += strlen( $tt );
01145                                         $add++;
01146                                 }
01147                         }
01148                 }
01149 
01150                 // $snippets = array_map('htmlspecialchars', $extended);
01151                 $snippets = $extended;
01152                 $last = - 1;
01153                 $extract = '';
01154                 foreach ( $snippets as $index => $line ) {
01155                         if ( $last == - 1 )
01156                                 $extract .= $line; // first line
01157                         elseif ( $last + 1 == $index && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] ) )
01158                                 $extract .= " " . $line; // continous lines
01159                         else
01160                                 $extract .= '<b> ... </b>' . $line;
01161 
01162                         $last = $index;
01163                 }
01164                 if ( $extract )
01165                         $extract .= '<b> ... </b>';
01166 
01167                 $processed = array();
01168                 foreach ( $terms as $term ) {
01169                         if ( ! isset( $processed[$term] ) ) {
01170                                 $pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
01171                                 $extract = preg_replace( $pat3,
01172                                         "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
01173                                 $processed[$term] = true;
01174                         }
01175                 }
01176 
01177                 wfProfileOut( "$fname-extract" );
01178 
01179                 return $extract;
01180         }
01181 
/**
 * Split text into lines and append every non-blank line, trimmed, to $extracts.
 *
 * When $this->mCleanWikitext is truthy the text is passed through
 * removeWiki() before it is split.
 *
 * @param array &$extracts Array the lines are stored in, keyed by $count
 * @param int &$count Next key to use in $extracts; incremented once per stored line
 * @param string $text Text to split on "\n"
 */
function splitAndAdd( &$extracts, &$count, $text ) {
        $source = $this->mCleanWikitext ? $this->removeWiki( $text ) : $text;
        foreach ( explode( "\n", $source ) as $line ) {
                $tt = trim( $line );
                // Compare against '' explicitly: a plain truthiness test would
                // also throw away a line that consists solely of "0".
                if ( $tt !== '' ) {
                        $extracts[$count++] = $tt;
                }
        }
}
01197 
/**
 * preg callback: when the match is longer than one byte, replace it by a
 * character class made of its lower-case and upper-case forms; shorter
 * matches are returned unchanged.
 *
 * @param array $matches Match array from preg; only $matches[0] is read
 * @return string
 */
function caseCallback( $matches ) {
        global $wgContLang;

        $hit = $matches[0];
        if ( strlen( $hit ) <= 1 ) {
                return $hit;
        }

        return '[' . $wgContLang->lc( $hit ) . $wgContLang->uc( $hit ) . ']';
}
01212 
/**
 * Cut the part of $text between two approximate byte positions.
 *
 * A non-zero $start and an $end that lies inside the text are both
 * adjusted through position() before cutting; an $end at or past the end
 * of the text is clamped to the text length.
 *
 * @param string $text
 * @param int $start Approximate start position
 * @param int $end Approximate end position
 * @param int|null &$posStart Receives the adjusted start, but only if the
 *   caller passed a non-null value
 * @param int|null &$posEnd Receives the adjusted end, but only if the
 *   caller passed a non-null value
 * @return string The cut text, or '' when the adjusted end is not after
 *   the adjusted start
 */
function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
        $textLen = strlen( $text );

        if ( $start != 0 ) {
                $start = $this->position( $text, $start, 1 );
        }
        $end = ( $end >= $textLen ) ? $textLen : $this->position( $text, $end );

        if ( $posStart !== null ) {
                $posStart = $start;
        }
        if ( $posEnd !== null ) {
                $posEnd = $end;
        }

        return ( $end > $start ) ? substr( $text, $start, $end - $start ) : '';
}
01246 
/**
 * Find a byte position near $point at which $text can be cut.
 *
 * The window of bytes from $point - 10 up to (not including) $point + 10,
 * clipped to the text, is searched for the first space or punctuation
 * character listed in the pattern below. If one is found its position plus
 * $offset is returned. Otherwise $point itself is used, moved forward past
 * any UTF-8 continuation bytes (0x80-0xBF) so that a multi-byte character
 * is never split; $offset is not applied in that case.
 *
 * @param string $text
 * @param int $point Approximate byte position
 * @param int $offset Added to the position of a boundary character, if found
 * @return int Byte position, never larger than strlen( $text ) in the
 *   fallback case
 */
function position( $text, $point, $offset = 0 ) {
        $tolerance = 10;
        $textLen = strlen( $text );
        $s = max( 0, $point - $tolerance );
        $l = min( $textLen, $point + $tolerance ) - $s;
        $m = array();
        if ( preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE ) ) {
                return $m[0][1] + $s + $offset;
        }

        // No boundary character nearby. A point at or beyond the end of the
        // text (this includes empty text) has no byte to inspect: reading
        // $text[$point] there would be an uninitialized string offset.
        if ( $point >= $textLen ) {
                return $textLen;
        }

        // check if point is on a valid first UTF8 char
        $char = ord( $text[$point] );
        while ( $char >= 0x80 && $char < 0xc0 ) {
                // skip trailing bytes
                $point++;
                if ( $point >= $textLen ) {
                        return $textLen;
                }
                $char = ord( $text[$point] );
        }
        return $point;
}
01276 
/**
 * Search the given lines for $pattern and store a basic snippet for each
 * line that matches, until no more lines are wanted.
 *
 * Lines whose index is already present in $out are skipped. For a matching
 * line a window of $contextchars bytes is chosen around the match and cut
 * with extract(); the snippet goes to $out[$index] and the start position
 * reported by extract() to $offsets[$index].
 *
 * @param string $pattern Regular expression
 * @param array $extracts Lines to search, keyed by line index
 * @param int &$linesleft Number of snippets still wanted; decremented per snippet
 * @param int &$contextchars Length of the snippet window
 * @param array &$out Snippets found so far, keyed by line index
 * @param array &$offsets Snippet start positions, keyed by line index
 */
function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
        if ( $linesleft == 0 ) {
                // nothing to do
                return;
        }

        foreach ( $extracts as $index => $line ) {
                $hit = array();
                // skip lines that already have a snippet and lines without a match
                if ( array_key_exists( $index, $out )
                        || !preg_match( $pattern, $line, $hit, PREG_OFFSET_CAPTURE )
                ) {
                        continue;
                }

                $matchPos = $hit[0][1];
                $matchLen = strlen( $hit[0][0] );

                if ( $matchPos + $matchLen < $contextchars ) {
                        // match ends inside the first window: start at the line start
                        $begin = 0;
                } elseif ( $matchLen > $contextchars ) {
                        // match is longer than the window: start at the match
                        $begin = $matchPos;
                } else {
                        // centre the match inside the window
                        $begin = $matchPos + intval( ( $matchLen - $contextchars ) / 2 );
                }

                // basic snippet from this line
                $snippetStart = $begin;
                $out[$index] = $this->extract( $line, $begin, $begin + $contextchars, $snippetStart );
                $offsets[$index] = $snippetStart;

                if ( --$linesleft == 0 ) {
                        return;
                }
        }
}
01319 
/**
 * Strip some wiki markup from text using a fixed sequence of regular
 * expression replacements: templates, internal links, tags and
 * bold/italic quote markup. Piped links are handled by linkReplace().
 *
 * @param string $text
 * @return string
 */
function removeWiki( $text ) {
        wfProfileIn( __METHOD__ );

        // Templates and unpiped links; applied in this order.
        $bracketRules = array(
                "/\\{\\{([^|]+?)\\}\\}/" => "",
                "/\\{\\{([^|]+\\|)(.*?)\\}\\}/" => "\\2",
                "/\\[\\[([^|]+?)\\]\\]/" => "\\1",
        );
        foreach ( $bracketRules as $pattern => $replacement ) {
                $text = preg_replace( $pattern, $replacement, $text );
        }

        // Piped links: linkReplace() decides what each one is replaced with.
        $text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );

        // Tags and quote markup are removed outright; applied in this order.
        $strippedPatterns = array(
                "/<\/?[^>]+>/",
                "/'''''/",
                "/('''|<\/?[iIuUbB]>)/",
                "/''/",
        );
        foreach ( $strippedPatterns as $pattern ) {
                $text = preg_replace( $pattern, "", $text );
        }

        wfProfileOut( __METHOD__ );
        return $text;
}
01348 
/**
 * preg callback for piped links ("[[target|caption]]").
 *
 * The link is returned untouched when the text before the first colon of
 * the target resolves to the file or category namespace; in every other
 * case only the caption is returned.
 *
 * @param array $matches 0 = whole link, 1 = target including the pipe, 2 = caption
 * @return string
 */
function linkReplace( $matches ) {
        global $wgContLang;

        $colon = strpos( $matches[1], ':' );
        if ( $colon !== false ) {
                $index = $wgContLang->getNsIndex( substr( $matches[1], 0, $colon ) );
                $keepLink = $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY );
                if ( $keepLink ) {
                        // return the whole thing
                        return $matches[0];
                }
        }

        // replace with caption
        return $matches[2];
}
01368 
/**
 * Simple and fast snippet generation: for each line of $text that matches
 * one of the terms, emit the match with some context before and after it,
 * HTML-escaped, with every term occurrence wrapped in
 * <span class='searchmatch'>.
 *
 * @param string $text
 * @param array $terms Regular expression fragments; they are joined with
 *   '|' and embedded in the patterns as they are (no quoting is applied)
 * @param int $contextlines Maximum number of matching lines to return
 * @param int $contextchars Amount of context kept on each side of the
 *   match, as passed to $wgContLang->truncate()
 * @return string HTML, one "\n"-terminated line per matching source line
 */
public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
        global $wgContLang;
        $fname = __METHOD__;

        $lines = explode( "\n", $text );

        $terms = implode( '|', $terms );
        $max = intval( $contextchars ) + 1;
        // 'u' makes matching UTF-8 aware, so that /i also folds non-ASCII
        // letters and '.' never stops in the middle of a multi-byte character.
        $pat1 = "/(.*)($terms)(.{0,$max})/ui";
        $pat2 = '/(' . $terms . ')/ui';

        $extract = '';
        wfProfileIn( "$fname-extract" );
        foreach ( $lines as $line ) {
                if ( 0 == $contextlines ) {
                        break;
                }
                $m = array();
                if ( !preg_match( $pat1, $line, $m ) ) {
                        continue;
                }
                --$contextlines;
                // truncate function changes ... to relevant i18n message.
                $pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );
                $post = isset( $m[3] )
                        ? $wgContLang->truncate( $m[3], $contextchars, '...', false )
                        : '';

                $snippet = $pre . $m[2] . $post;

                // Locate the terms in the raw text and escape the pieces while
                // assembling the line. Highlighting text that is already escaped
                // would let a term match inside an entity such as "&amp;" and
                // would miss terms that contain characters changed by escaping.
                $html = '';
                $done = 0;
                $hits = array();
                if ( preg_match_all( $pat2, $snippet, $hits, PREG_OFFSET_CAPTURE ) ) {
                        foreach ( $hits[0] as $hit ) {
                                $found = $hit[0];
                                $at = $hit[1];
                                if ( $found === '' ) {
                                        // an empty match has nothing to highlight
                                        continue;
                                }
                                $html .= htmlspecialchars( (string)substr( $snippet, $done, $at - $done ) )
                                        . "<span class='searchmatch'>" . htmlspecialchars( $found ) . '</span>';
                                $done = $at + strlen( $found );
                        }
                }
                $html .= htmlspecialchars( (string)substr( $snippet, $done ) );

                $extract .= $html . "\n";
        }
        wfProfileOut( "$fname-extract" );

        return $extract;
}
01425 
01426 }
01427 
/**
 * Dummy search engine: a SearchEngine subclass that overrides nothing, so
 * every method keeps the behaviour of the base class (for example
 * searchText() and searchTitle() returning null).
 */
class SearchEngineDummy extends SearchEngine {
}