123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116 |
- <?php declare(strict_types=1);
- namespace PhpParser;
- require __DIR__ . '/compatibility_tokens.php';
class Lexer {
    /**
     * Tokenize the provided source code.
     *
     * The token array is in the same format as provided by the PhpToken::tokenize() method in
     * PHP 8.0. The tokens are instances of PhpParser\Token, to abstract over a polyfill
     * implementation in earlier PHP versions.
     *
     * The token array is terminated by a sentinel token with token ID 0.
     * The token array does not discard any tokens (i.e. whitespace and comments are included).
     * The token position attributes are against this token array.
     *
     * @param string $code The source code to tokenize.
     * @param ErrorHandler|null $errorHandler Error handler to use for lexing errors. Defaults to
     *                                        ErrorHandler\Throwing.
     * @return Token[] Tokens
     */
    public function tokenize(string $code, ?ErrorHandler $errorHandler = null): array {
        if (null === $errorHandler) {
            $errorHandler = new ErrorHandler\Throwing();
        }

        // Xdebug's scream mode makes the @ operator ineffective, so it is switched off while
        // the suppressed tokenizer call below runs. ini_set() returns the previous value, or
        // false if the setting could not be changed.
        $scream = ini_set('xdebug.scream', '0');

        try {
            $tokens = @Token::tokenize($code);
            $this->postprocessTokens($tokens, $errorHandler);
        } finally {
            // Restore the previous setting on every exit path, including when postprocessing
            // throws (for example through the error handler), so that the ini change does not
            // outlive this call.
            if (false !== $scream) {
                ini_set('xdebug.scream', $scream);
            }
        }

        return $tokens;
    }

    /**
     * Report a T_BAD_CHARACTER token to the error handler.
     *
     * The reported error starts and ends on the token's own line and file position.
     *
     * @param Token $token The offending token.
     * @param ErrorHandler $errorHandler Handler that receives the error.
     */
    private function handleInvalidCharacter(Token $token, ErrorHandler $errorHandler): void {
        $chr = $token->text;
        if ($chr === "\0") {
            // PHP cuts error message after null byte, so need special case
            $errorMsg = 'Unexpected null byte';
        } else {
            $errorMsg = sprintf(
                'Unexpected character "%s" (ASCII %d)', $chr, ord($chr)
            );
        }

        $errorHandler->handleError(new Error($errorMsg, [
            'startLine' => $token->line,
            'endLine' => $token->line,
            'startFilePos' => $token->pos,
            'endFilePos' => $token->pos,
        ]));
    }

    /**
     * Check whether a token is a block comment that lacks its closing delimiter.
     *
     * Only T_COMMENT and T_DOC_COMMENT tokens whose text starts with the block comment opener
     * are considered; every other token yields false.
     *
     * @param Token $token Token to inspect.
     * @return bool True if the token is a block comment without a closing delimiter.
     */
    private function isUnterminatedComment(Token $token): bool {
        if (!$token->is([\T_COMMENT, \T_DOC_COMMENT])) {
            return false;
        }

        $text = $token->text;
        if (substr($text, 0, 2) !== '/*') {
            return false;
        }

        // The opener and the closer must not share the "*": the three byte text "/*/" both
        // starts with "/*" and ends with "*/", yet it is an unterminated comment. A terminated
        // block comment is therefore at least four bytes long.
        return \strlen($text) < 4 || substr($text, -2) !== '*/';
    }

    /**
     * Report lexing errors and canonicalize the token array in place.
     *
     * @param list<Token> $tokens Token array, modified by reference.
     * @param ErrorHandler $errorHandler Handler that receives bad character and unterminated
     *                                   comment errors.
     */
    protected function postprocessTokens(array &$tokens, ErrorHandler $errorHandler): void {
        // This function reports errors (bad characters and unterminated comments) in the token
        // array, and performs certain canonicalizations:
        //  * Use PHP 8.1 T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG and
        //    T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG tokens used to disambiguate intersection types.
        //  * Add a sentinel token with ID 0.

        $numTokens = \count($tokens);
        if ($numTokens === 0) {
            // Empty input edge case: Just add the sentinel token.
            $tokens[] = new Token(0, "\0", 1, 0);
            return;
        }

        for ($i = 0; $i < $numTokens; $i++) {
            $token = $tokens[$i];
            if ($token->id === \T_BAD_CHARACTER) {
                $this->handleInvalidCharacter($token, $errorHandler);
            }

            if ($token->id === \ord('&')) {
                // Skip over whitespace tokens to find the token that decides which of the two
                // ampersand token IDs applies.
                $next = $i + 1;
                while (isset($tokens[$next]) && $tokens[$next]->id === \T_WHITESPACE) {
                    $next++;
                }
                $followedByVarOrVarArg = isset($tokens[$next]) &&
                    $tokens[$next]->is([\T_VARIABLE, \T_ELLIPSIS]);
                $token->id = $followedByVarOrVarArg
                    ? \T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG
                    : \T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG;
            }
        }

        // Check for unterminated comment
        $lastToken = $tokens[$numTokens - 1];
        if ($this->isUnterminatedComment($lastToken)) {
            $errorHandler->handleError(new Error('Unterminated comment', [
                'startLine' => $lastToken->line,
                'endLine' => $lastToken->getEndLine(),
                'startFilePos' => $lastToken->pos,
                'endFilePos' => $lastToken->getEndPos(),
            ]));
        }

        // Add sentinel token.
        $tokens[] = new Token(0, "\0", $lastToken->getEndLine(), $lastToken->getEndPos());
    }
}
|