Lexer.php 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. <?php declare(strict_types=1);
  2. namespace PhpParser;
  3. require __DIR__ . '/compatibility_tokens.php';
  class Lexer {
      /**
       * Tokenize the provided source code.
       *
       * The token array is in the same format as provided by the PhpToken::tokenize() method in
       * PHP 8.0. The tokens are instances of PhpParser\Token, to abstract over a polyfill
       * implementation in earlier PHP versions.
       *
       * The token array is terminated by a sentinel token with token ID 0.
       * The token array does not discard any tokens (i.e. whitespace and comments are included).
       * The token position attributes are against this token array.
       *
       * @param string $code The source code to tokenize.
       * @param ErrorHandler|null $errorHandler Error handler to use for lexing errors. Defaults to
       *                                        ErrorHandler\Throwing.
       * @return Token[] Tokens
       */
      public function tokenize(string $code, ?ErrorHandler $errorHandler = null): array {
          if (null === $errorHandler) {
              $errorHandler = new ErrorHandler\Throwing();
          }

          // Xdebug's "scream" mode disables the @ operator; switch it off while tokenizing so the
          // suppression below is effective. ini_set() returns the old value, or false on failure.
          $scream = ini_set('xdebug.scream', '0');

          try {
              $tokens = @Token::tokenize($code);
              $this->postprocessTokens($tokens, $errorHandler);
          } finally {
              // Restore the previous setting even if the error handler threw while reporting a
              // lexing error; otherwise the changed ini value would leak into the caller.
              if (false !== $scream) {
                  ini_set('xdebug.scream', $scream);
              }
          }

          return $tokens;
      }

      /**
       * Report a T_BAD_CHARACTER token to the error handler.
       *
       * The reported error spans exactly the offending token: start and end line are the token's
       * line, start and end file position are the token's position.
       *
       * @param Token $token The bad-character token.
       * @param ErrorHandler $errorHandler Handler that receives the error.
       */
      private function handleInvalidCharacter(Token $token, ErrorHandler $errorHandler): void {
          $chr = $token->text;
          if ($chr === "\0") {
              // PHP cuts error message after null byte, so need special case
              $errorMsg = 'Unexpected null byte';
          } else {
              $errorMsg = sprintf(
                  'Unexpected character "%s" (ASCII %d)', $chr, ord($chr)
              );
          }

          $errorHandler->handleError(new Error($errorMsg, [
              'startLine' => $token->line,
              'endLine' => $token->line,
              'startFilePos' => $token->pos,
              'endFilePos' => $token->pos,
          ]));
      }

      /**
       * Check whether a token is a block comment that is missing its closing delimiter.
       *
       * Only T_COMMENT / T_DOC_COMMENT tokens whose text starts with the block-comment opener are
       * considered; other comment styles and other token kinds always yield false.
       *
       * @param Token $token Token to inspect.
       * @return bool True if the token is an unterminated block comment.
       */
      private function isUnterminatedComment(Token $token): bool {
          if (!$token->is([\T_COMMENT, \T_DOC_COMMENT])) {
              return false;
          }

          $text = $token->text;
          if (substr($text, 0, 2) !== '/*') {
              return false;
          }

          // The shortest terminated block comment is four bytes long (opener plus closer). For
          // anything shorter the closer would overlap the opener (a lone slash-star-slash), so
          // the comment cannot be terminated even though its last two bytes look like a closer.
          return \strlen($text) < 4 || substr($text, -2) !== '*/';
      }

      /**
       * Report lexing errors for a raw token array and canonicalize it in place.
       *
       * @param list<Token> $tokens Token array, modified by reference: ampersand token IDs are
       *                            rewritten and a sentinel token is appended.
       * @param ErrorHandler $errorHandler Handler that receives bad-character and
       *                                   unterminated-comment errors.
       */
      protected function postprocessTokens(array &$tokens, ErrorHandler $errorHandler): void {
          // This function reports errors (bad characters and unterminated comments) in the token
          // array, and performs certain canonicalizations:
          //  * Use PHP 8.1 T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG and
          //    T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG tokens used to disambiguate intersection types.
          //  * Add a sentinel token with ID 0.

          $numTokens = \count($tokens);
          if ($numTokens === 0) {
              // Empty input edge case: Just add the sentinel token.
              $tokens[] = new Token(0, "\0", 1, 0);
              return;
          }

          for ($i = 0; $i < $numTokens; $i++) {
              $token = $tokens[$i];
              if ($token->id === \T_BAD_CHARACTER) {
                  $this->handleInvalidCharacter($token, $errorHandler);
              }

              if ($token->id === \ord('&')) {
                  // Look past any whitespace tokens to find what the ampersand is followed by.
                  $next = $i + 1;
                  while (isset($tokens[$next]) && $tokens[$next]->id === \T_WHITESPACE) {
                      $next++;
                  }
                  $followedByVarOrVarArg = isset($tokens[$next]) &&
                      $tokens[$next]->is([\T_VARIABLE, \T_ELLIPSIS]);
                  $token->id = $followedByVarOrVarArg
                      ? \T_AMPERSAND_FOLLOWED_BY_VAR_OR_VARARG
                      : \T_AMPERSAND_NOT_FOLLOWED_BY_VAR_OR_VARARG;
              }
          }

          // Check for unterminated comment. Only the last token is inspected: an unterminated
          // block comment runs to the end of the input.
          $lastToken = $tokens[$numTokens - 1];
          if ($this->isUnterminatedComment($lastToken)) {
              $errorHandler->handleError(new Error('Unterminated comment', [
                  'startLine' => $lastToken->line,
                  'endLine' => $lastToken->getEndLine(),
                  'startFilePos' => $lastToken->pos,
                  'endFilePos' => $lastToken->getEndPos(),
              ]));
          }

          // Add sentinel token, positioned at the end of the last real token.
          $tokens[] = new Token(0, "\0", $lastToken->getEndLine(), $lastToken->getEndPos());
      }
  }