_tokenizer.py 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. import contextlib
  2. import re
  3. from dataclasses import dataclass
  4. from typing import Dict, Iterator, NoReturn, Optional, Tuple, Union
  5. from .specifiers import Specifier
  6. @dataclass
  7. class Token:
  8. name: str
  9. text: str
  10. position: int
  11. class ParserSyntaxError(Exception):
  12. """The provided source text could not be parsed correctly."""
  13. def __init__(
  14. self,
  15. message: str,
  16. *,
  17. source: str,
  18. span: Tuple[int, int],
  19. ) -> None:
  20. self.span = span
  21. self.message = message
  22. self.source = source
  23. super().__init__()
  24. def __str__(self) -> str:
  25. marker = " " * self.span[0] + "~" * (self.span[1] - self.span[0]) + "^"
  26. return "\n ".join([self.message, self.source, marker])
  27. DEFAULT_RULES: "Dict[str, Union[str, re.Pattern[str]]]" = {
  28. "LEFT_PARENTHESIS": r"\(",
  29. "RIGHT_PARENTHESIS": r"\)",
  30. "LEFT_BRACKET": r"\[",
  31. "RIGHT_BRACKET": r"\]",
  32. "SEMICOLON": r";",
  33. "COMMA": r",",
  34. "QUOTED_STRING": re.compile(
  35. r"""
  36. (
  37. ('[^']*')
  38. |
  39. ("[^"]*")
  40. )
  41. """,
  42. re.VERBOSE,
  43. ),
  44. "OP": r"(===|==|~=|!=|<=|>=|<|>)",
  45. "BOOLOP": r"\b(or|and)\b",
  46. "IN": r"\bin\b",
  47. "NOT": r"\bnot\b",
  48. "VARIABLE": re.compile(
  49. r"""
  50. \b(
  51. python_version
  52. |python_full_version
  53. |os[._]name
  54. |sys[._]platform
  55. |platform_(release|system)
  56. |platform[._](version|machine|python_implementation)
  57. |python_implementation
  58. |implementation_(name|version)
  59. |extra
  60. )\b
  61. """,
  62. re.VERBOSE,
  63. ),
  64. "SPECIFIER": re.compile(
  65. Specifier._operator_regex_str + Specifier._version_regex_str,
  66. re.VERBOSE | re.IGNORECASE,
  67. ),
  68. "AT": r"\@",
  69. "URL": r"[^ \t]+",
  70. "IDENTIFIER": r"\b[a-zA-Z0-9][a-zA-Z0-9._-]*\b",
  71. "VERSION_PREFIX_TRAIL": r"\.\*",
  72. "VERSION_LOCAL_LABEL_TRAIL": r"\+[a-z0-9]+(?:[-_\.][a-z0-9]+)*",
  73. "WS": r"[ \t]+",
  74. "END": r"$",
  75. }
  76. class Tokenizer:
  77. """Context-sensitive token parsing.
  78. Provides methods to examine the input stream to check whether the next token
  79. matches.
  80. """
  81. def __init__(
  82. self,
  83. source: str,
  84. *,
  85. rules: "Dict[str, Union[str, re.Pattern[str]]]",
  86. ) -> None:
  87. self.source = source
  88. self.rules: Dict[str, re.Pattern[str]] = {
  89. name: re.compile(pattern) for name, pattern in rules.items()
  90. }
  91. self.next_token: Optional[Token] = None
  92. self.position = 0
  93. def consume(self, name: str) -> None:
  94. """Move beyond provided token name, if at current position."""
  95. if self.check(name):
  96. self.read()
  97. def check(self, name: str, *, peek: bool = False) -> bool:
  98. """Check whether the next token has the provided name.
  99. By default, if the check succeeds, the token *must* be read before
  100. another check. If `peek` is set to `True`, the token is not loaded and
  101. would need to be checked again.
  102. """
  103. assert (
  104. self.next_token is None
  105. ), f"Cannot check for {name!r}, already have {self.next_token!r}"
  106. assert name in self.rules, f"Unknown token name: {name!r}"
  107. expression = self.rules[name]
  108. match = expression.match(self.source, self.position)
  109. if match is None:
  110. return False
  111. if not peek:
  112. self.next_token = Token(name, match[0], self.position)
  113. return True
  114. def expect(self, name: str, *, expected: str) -> Token:
  115. """Expect a certain token name next, failing with a syntax error otherwise.
  116. The token is *not* read.
  117. """
  118. if not self.check(name):
  119. raise self.raise_syntax_error(f"Expected {expected}")
  120. return self.read()
  121. def read(self) -> Token:
  122. """Consume the next token and return it."""
  123. token = self.next_token
  124. assert token is not None
  125. self.position += len(token.text)
  126. self.next_token = None
  127. return token
  128. def raise_syntax_error(
  129. self,
  130. message: str,
  131. *,
  132. span_start: Optional[int] = None,
  133. span_end: Optional[int] = None,
  134. ) -> NoReturn:
  135. """Raise ParserSyntaxError at the given position."""
  136. span = (
  137. self.position if span_start is None else span_start,
  138. self.position if span_end is None else span_end,
  139. )
  140. raise ParserSyntaxError(
  141. message,
  142. source=self.source,
  143. span=span,
  144. )
  145. @contextlib.contextmanager
  146. def enclosing_tokens(
  147. self, open_token: str, close_token: str, *, around: str
  148. ) -> Iterator[None]:
  149. if self.check(open_token):
  150. open_position = self.position
  151. self.read()
  152. else:
  153. open_position = None
  154. yield
  155. if open_position is None:
  156. return
  157. if not self.check(close_token):
  158. self.raise_syntax_error(
  159. f"Expected matching {close_token} for {open_token}, after {around}",
  160. span_start=open_position,
  161. )
  162. self.read()