MediaWiki  master
CLDRPluralRuleEvaluator.php
Go to the documentation of this file.
00001 <?php
00015 class CLDRPluralRuleEvaluator {
00024         public static function evaluate( $number, array $rules ) {
00025                 $rules = self::compile( $rules );
00026                 return self::evaluateCompiled( $number, $rules );
00027         }
00028 
00036         public static function compile( array $rules ) {
00037                 // We can't use array_map() for this because it generates a warning if
00038                 // there is an exception.
00039                 foreach ( $rules as &$rule ) {
00040                         $rule = CLDRPluralRuleConverter::convert( $rule );
00041                 }
00042                 return $rules;
00043         }
00044 
00049         public static function evaluateCompiled( $number, array $rules ) {
00050                 // The compiled form is RPN, with tokens strictly delimited by
00051                 // spaces, so this is a simple RPN evaluator.
00052                 foreach ( $rules as $i => $rule  ) {
00053                         $stack = array();
00054                         $zero = ord( '0' );
00055                         $nine = ord( '9' );
00056                         foreach ( StringUtils::explode( ' ', $rule ) as $token ) {
00057                                 $ord = ord( $token );
00058                                 if ( $token === 'n' ) {
00059                                         $stack[] = $number;
00060                                 } elseif ( $ord >= $zero && $ord <= $nine ) {
00061                                         $stack[] = intval( $token );
00062                                 } else {
00063                                         $right = array_pop( $stack );
00064                                         $left = array_pop( $stack );
00065                                         $result = self::doOperation( $token, $left, $right );
00066                                         $stack[] = $result;
00067                                 }
00068                         }
00069                         if ( $stack[0] ) {
00070                                 return $i;
00071                         }
00072                 }
00073                 // None of the provided rules match. The number belongs to caregory
00074                 // 'other' which comes last.
00075                 return count( $rules );
00076         }
00077 
00087         private static function doOperation( $token, $left, $right ) {
00088                 if ( in_array( $token, array( 'in', 'not-in', 'within', 'not-within' ) ) ) {
00089                         if ( !($right instanceof CLDRPluralRuleEvaluator_Range ) ) {
00090                                 $right = new CLDRPluralRuleEvaluator_Range( $right );
00091                         }
00092                 }
00093                 switch ( $token ) {
00094                         case 'or':
00095                                 return $left || $right;
00096                         case 'and':
00097                                 return $left && $right;
00098                         case 'is':
00099                                 return $left == $right;
00100                         case 'is-not':
00101                                 return $left != $right;
00102                         case 'in':
00103                                 return $right->isNumberIn( $left );
00104                         case 'not-in':
00105                                 return !$right->isNumberIn( $left );
00106                         case 'within':
00107                                 return $right->isNumberWithin( $left );
00108                         case 'not-within':
00109                                 return !$right->isNumberWithin( $left );
00110                         case 'mod':
00111                                 if ( is_int( $left ) ) {
00112                                         return (int) fmod( $left, $right );
00113                                 }
00114                                 return fmod( $left, $right );
00115                         case ',':
00116                                 if ( $left instanceof CLDRPluralRuleEvaluator_Range ) {
00117                                         $range = $left;
00118                                 } else {
00119                                         $range = new CLDRPluralRuleEvaluator_Range( $left );
00120                                 }
00121                                 $range->add( $right );
00122                                 return $range;
00123                         case '..':
00124                                 return new CLDRPluralRuleEvaluator_Range( $left, $right );
00125                         default:
00126                                 throw new CLDRPluralRuleError( "Invalid RPN token" );
00127                 }
00128         }
00129 }
00130 
00134 class CLDRPluralRuleEvaluator_Range {
00135         public $parts = array();
00136 
00137         function __construct( $start, $end = false ) {
00138                 if ( $end === false ) {
00139                         $this->parts[] = $start;
00140                 } else {
00141                         $this->parts[] = array( $start, $end );
00142                 }
00143         }
00144 
00150         function isNumberIn( $number, $integerConstraint = true ) {
00151                 foreach ( $this->parts as $part ) {
00152                         if ( is_array( $part ) ) {
00153                                 if ( ( !$integerConstraint || floor( $number ) === (float)$number )
00154                                         && $number >= $part[0] && $number <= $part[1] )
00155                                 {
00156                                         return true;
00157                                 }
00158                         } else {
00159                                 if ( $number == $part ) {
00160                                         return true;
00161                                 }
00162                         }
00163                 }
00164                 return false;
00165         }
00166 
00171         function isNumberWithin( $number ) {
00172                 return $this->isNumberIn( $number, false );
00173         }
00174 
00179         function add( $other ) {
00180                 if ( $other instanceof self ) {
00181                         $this->parts = array_merge( $this->parts, $other->parts );
00182                 } else {
00183                         $this->parts[] = $other;
00184                 }
00185         }
00186 
00190         function __toString() {
00191                 $s = 'Range(';
00192                 foreach ( $this->parts as $i => $part ) {
00193                         if ( $i ) {
00194                                 $s .= ', ';
00195                         }
00196                         if ( is_array( $part ) ) {
00197                                 $s .= $part[0] . '..' . $part[1];
00198                         } else {
00199                                 $s .= $part;
00200                         }
00201                 }
00202                 $s .= ')';
00203                 return $s;
00204         }
00205 
00206 }
00207 
00211 class CLDRPluralRuleConverter {
00212         public $rule, $pos, $end;
00213         public $operators = array();
00214         public $operands = array();
00215 
00221         static $precedence = array(
00222                 'or' => 2,
00223                 'and' => 3,
00224                 'is' => 4,
00225                 'is-not' => 4,
00226                 'in' => 4,
00227                 'not-in' => 4,
00228                 'within' => 4,
00229                 'not-within' => 4,
00230                 'mod' => 5,
00231                 ',' => 6,
00232                 '..' => 7,
00233         );
00234 
00238         const WHITESPACE_CLASS = " \t\r\n";
00239 
00244         const NUMBER_CLASS = '0123456789';
00245 
00249         const WORD_REGEX = '/[a-zA-Z]+/A';
00250 
00254         public static function convert( $rule ) {
00255                 $parser = new self( $rule );
00256                 return $parser->doConvert();
00257         }
00258 
00262         protected function __construct( $rule ) {
00263                 $this->rule = $rule;
00264                 $this->pos = 0;
00265                 $this->end = strlen( $rule );
00266         }
00267 
00271         protected function doConvert() {
00272                 $expectOperator = true;
00273 
00274                 // Iterate through all tokens, saving the operators and operands to a
00275                 // stack per Dijkstra's shunting yard algorithm.
00276                 while ( false !== ( $token = $this->nextToken() ) ) {
00277                         // In this grammar, there are only binary operators, so every valid
00278                         // rule string will alternate between operator and operand tokens.
00279                         $expectOperator = !$expectOperator;
00280 
00281                         if ( $token instanceof CLDRPluralRuleConverter_Expression ) {
00282                                 // Operand
00283                                 if ( $expectOperator ) {
00284                                         $token->error( 'unexpected operand' );
00285                                 }
00286                                 $this->operands[] = $token;
00287                                 continue;
00288                         } else {
00289                                 // Operator
00290                                 if  ( !$expectOperator ) {
00291                                         $token->error( 'unexpected operator' );
00292                                 }
00293                                 // Resolve higher precedence levels
00294                                 $lastOp = end( $this->operators );
00295                                 while ( $lastOp && self::$precedence[$token->name] <= self::$precedence[$lastOp->name] ) {
00296                                         $this->doOperation( $lastOp, $this->operands );
00297                                         array_pop( $this->operators );
00298                                         $lastOp = end( $this->operators );
00299                                 }
00300                                 $this->operators[] = $token;
00301                         }
00302                 }
00303 
00304                 // Finish off the stack
00305                 while ( $op = array_pop( $this->operators ) ) {
00306                         $this->doOperation( $op, $this->operands );
00307                 }
00308 
00309                 // Make sure the result is sane. The first case is possible for an empty
00310                 // string input, the second should be unreachable.
00311                 if ( !count( $this->operands ) ) {
00312                         $this->error( 'condition expected' );
00313                 } elseif ( count( $this->operands ) > 1 ) {
00314                         $this->error( 'missing operator or too many operands' );
00315                 }
00316 
00317                 $value = $this->operands[0];
00318                 if ( $value->type !== 'boolean' ) {
00319                         $this->error( 'the result must have a boolean type' );
00320                 }
00321 
00322                 return $this->operands[0]->rpn;
00323         }
00324 
00329         protected function nextToken() {
00330                 if ( $this->pos >= $this->end ) {
00331                         return false;
00332                 }
00333 
00334                 // Whitespace
00335                 $length = strspn( $this->rule, self::WHITESPACE_CLASS, $this->pos );
00336                 $this->pos += $length;
00337 
00338                 if ( $this->pos >= $this->end ) {
00339                         return false;
00340                 }
00341 
00342                 // Number
00343                 $length = strspn( $this->rule, self::NUMBER_CLASS, $this->pos );
00344                 if ( $length !== 0 ) {
00345                         $token = $this->newNumber( substr( $this->rule, $this->pos, $length ), $this->pos );
00346                         $this->pos += $length;
00347                         return $token;
00348                 }
00349 
00350                 // Comma
00351                 if ( $this->rule[$this->pos] === ',' ) {
00352                         $token = $this->newOperator( ',', $this->pos, 1 );
00353                         $this->pos ++;
00354                         return $token;
00355                 }
00356 
00357                 // Dot dot
00358                 if ( substr( $this->rule, $this->pos, 2 ) === '..' ) {
00359                         $token = $this->newOperator( '..', $this->pos, 2 );
00360                         $this->pos += 2;
00361                         return $token;
00362                 }
00363 
00364                 // Word
00365                 if ( !preg_match( self::WORD_REGEX, $this->rule, $m, 0, $this->pos ) ) {
00366                         $this->error( 'unexpected character "' . $this->rule[$this->pos] . '"'  );
00367                 }
00368                 $word1 = strtolower( $m[0] );
00369                 $word2 = '';
00370                 $nextTokenPos = $this->pos + strlen( $word1 );
00371                 if ( $word1 === 'not' || $word1 === 'is' ) {
00372                         // Look ahead one word
00373                         $nextTokenPos += strspn( $this->rule, self::WHITESPACE_CLASS, $nextTokenPos );
00374                         if ( $nextTokenPos < $this->end
00375                                         && preg_match( self::WORD_REGEX, $this->rule, $m, 0, $nextTokenPos ) )
00376                         {
00377                                 $word2 = strtolower( $m[0] );
00378                                 $nextTokenPos += strlen( $word2 );
00379                         }
00380                 }
00381 
00382                 // Two-word operators like "is not" take precedence over single-word operators like "is"
00383                 if ( $word2 !== '' ) {
00384                         $bothWords = "{$word1}-{$word2}";
00385                         if ( isset( self::$precedence[$bothWords] ) ) {
00386                                 $token = $this->newOperator( $bothWords, $this->pos, $nextTokenPos - $this->pos );
00387                                 $this->pos = $nextTokenPos;
00388                                 return $token;
00389                         }
00390                 }
00391 
00392                 // Single-word operators
00393                 if ( isset( self::$precedence[$word1] ) ) {
00394                         $token = $this->newOperator( $word1, $this->pos, strlen( $word1 ) );
00395                         $this->pos += strlen( $word1 );
00396                         return $token;
00397                 }
00398 
00399                 // The special numerical keyword "n"
00400                 if ( $word1 === 'n' ) {
00401                         $token = $this->newNumber( 'n', $this->pos );
00402                         $this->pos ++;
00403                         return $token;
00404                 }
00405 
00406                 $this->error( 'unrecognised word' );
00407         }
00408 
00414         protected function doOperation( $op ) {
00415                 if ( count( $this->operands ) < 2 ) {
00416                         $op->error( 'missing operand' );
00417                 }
00418                 $right = array_pop( $this->operands );
00419                 $left = array_pop( $this->operands );
00420                 $result = $op->operate( $left, $right );
00421                 $this->operands[] = $result;
00422         }
00423 
00427         protected function newNumber( $text, $pos ) {
00428                 return new CLDRPluralRuleConverter_Expression( $this, 'number', $text, $pos, strlen( $text ) );
00429         }
00430 
00434         protected function newOperator( $type, $pos, $length ) {
00435                 return new CLDRPluralRuleConverter_Operator( $this, $type, $pos, $length );
00436         }
00437 
00441         protected function error( $message ) {
00442                 throw new CLDRPluralRuleError( $message );
00443         }
00444 }
00445 
00450 class CLDRPluralRuleConverter_Fragment {
00451         public $parser, $pos, $length, $end;
00452 
00453         function __construct( $parser, $pos, $length ) {
00454                 $this->parser = $parser;
00455                 $this->pos = $pos;
00456                 $this->length = $length;
00457                 $this->end = $pos + $length;
00458         }
00459 
00460         public function error( $message ) {
00461                 $text = $this->getText();
00462                 throw new CLDRPluralRuleError( "$message at position " . ( $this->pos + 1 ) . ": \"$text\"" );
00463         }
00464 
00465         public function getText() {
00466                 return substr( $this->parser->rule, $this->pos, $this->length );
00467         }
00468 }
00469 
00476 class CLDRPluralRuleConverter_Expression extends CLDRPluralRuleConverter_Fragment {
00477         public $type, $rpn;
00478 
00479         function __construct( $parser, $type, $rpn, $pos, $length ) {
00480                 parent::__construct( $parser, $pos, $length );
00481                 $this->type = $type;
00482                 $this->rpn = $rpn;
00483         }
00484 
00485         public function isType( $type ) {
00486                 if ( $type === 'range' && ( $this->type === 'range' || $this->type === 'number' ) ) {
00487                         return true;
00488                 }
00489                 if ( $type === $this->type ) {
00490                         return true;
00491                 }
00492                 return false;
00493         }
00494 }
00495 
00501 class CLDRPluralRuleConverter_Operator extends CLDRPluralRuleConverter_Fragment {
00502         public $name;
00503 
00513         static $opTypes = array(
00514                 'or' => 'bbb',
00515                 'and' => 'bbb',
00516                 'is' => 'nnb',
00517                 'is-not' => 'nnb',
00518                 'in' => 'nrb',
00519                 'not-in' => 'nrb',
00520                 'within' => 'nrb',
00521                 'not-within' => 'nrb',
00522                 'mod' => 'nnn',
00523                 ',' => 'rrr',
00524                 '..' => 'nnr',
00525         );
00526 
00530         static $typeSpecMap = array(
00531                 'b' => 'boolean',
00532                 'n' => 'number',
00533                 'r' => 'range',
00534         );
00535 
00536         function __construct( $parser, $name, $pos, $length ) {
00537                 parent::__construct( $parser, $pos, $length );
00538                 $this->name = $name;
00539         }
00540 
00541         public function operate( $left, $right ) {
00542                 $typeSpec = self::$opTypes[$this->name];
00543 
00544                 $leftType = self::$typeSpecMap[$typeSpec[0]];
00545                 $rightType = self::$typeSpecMap[$typeSpec[1]];
00546                 $resultType = self::$typeSpecMap[$typeSpec[2]];
00547 
00548                 $start = min( $this->pos, $left->pos, $right->pos );
00549                 $end = max( $this->end, $left->end, $right->end );
00550                 $length = $end - $start;
00551 
00552                 $newExpr = new CLDRPluralRuleConverter_Expression( $this->parser, $resultType,
00553                         "{$left->rpn} {$right->rpn} {$this->name}",
00554                         $start, $length );
00555 
00556                 if ( !$left->isType( $leftType ) ) {
00557                         $newExpr->error( "invalid type for left operand: expected $leftType, got {$left->type}" );
00558                 }
00559 
00560                 if ( !$right->isType( $rightType ) ) {
00561                         $newExpr->error( "invalid type for right operand: expected $rightType, got {$right->type}" );
00562                 }
00563                 return $newExpr;
00564         }
00565 }
00566 
00571 class CLDRPluralRuleError extends MWException {
00572         function __construct( $message ) {
00573                 parent::__construct( 'CLDR plural rule error: ' . $message );
00574         }
00575 }