_parser.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356
  1. """Handwritten parser of dependency specifiers.
The docstring for each _parse_* function contains EBNF-inspired grammar representing
  3. the implementation.
  4. """
  5. import ast
  6. from typing import Any, List, NamedTuple, Optional, Tuple, Union
  7. from ._tokenizer import DEFAULT_RULES, Tokenizer
  8. class Node:
  9. def __init__(self, value: str) -> None:
  10. self.value = value
  11. def __str__(self) -> str:
  12. return self.value
  13. def __repr__(self) -> str:
  14. return f"<{self.__class__.__name__}('{self}')>"
  15. def serialize(self) -> str:
  16. raise NotImplementedError
  17. class Variable(Node):
  18. def serialize(self) -> str:
  19. return str(self)
  20. class Value(Node):
  21. def serialize(self) -> str:
  22. return f'"{self}"'
  23. class Op(Node):
  24. def serialize(self) -> str:
  25. return str(self)
# One operand of a marker comparison: a variable reference or a literal value.
MarkerVar = Union[Variable, Value]
# A single comparison: (left operand, operator, right operand).
MarkerItem = Tuple[MarkerVar, Op, MarkerVar]
# The precise (recursive) definitions would be the following:
# MarkerAtom = Union[MarkerItem, List["MarkerAtom"]]
# MarkerList = List[Union["MarkerList", MarkerAtom, str]]
# mypy does not support recursive type definition
# https://github.com/python/mypy/issues/731
# so the recursive parts are widened to ``Any`` instead.
MarkerAtom = Any
MarkerList = List[Any]
class ParsedRequirement(NamedTuple):
    """Result of parsing one dependency specifier."""

    # Text of the leading IDENTIFIER token (the package name).
    name: str
    # Text of the URL token following "@", or "" when no URL was given.
    url: str
    # Extra names found between the brackets; empty list when there are none.
    extras: List[str]
    # Concatenated version-specifier text, or "" when none was given.
    specifier: str
    # Parsed marker expression, or None when the requirement has no marker.
    marker: Optional[MarkerList]
  40. # --------------------------------------------------------------------------------------
  41. # Recursive descent parser for dependency specifier
  42. # --------------------------------------------------------------------------------------
  43. def parse_requirement(source: str) -> ParsedRequirement:
  44. return _parse_requirement(Tokenizer(source, rules=DEFAULT_RULES))
  45. def _parse_requirement(tokenizer: Tokenizer) -> ParsedRequirement:
  46. """
  47. requirement = WS? IDENTIFIER WS? extras WS? requirement_details
  48. """
  49. tokenizer.consume("WS")
  50. name_token = tokenizer.expect(
  51. "IDENTIFIER", expected="package name at the start of dependency specifier"
  52. )
  53. name = name_token.text
  54. tokenizer.consume("WS")
  55. extras = _parse_extras(tokenizer)
  56. tokenizer.consume("WS")
  57. url, specifier, marker = _parse_requirement_details(tokenizer)
  58. tokenizer.expect("END", expected="end of dependency specifier")
  59. return ParsedRequirement(name, url, extras, specifier, marker)
  60. def _parse_requirement_details(
  61. tokenizer: Tokenizer,
  62. ) -> Tuple[str, str, Optional[MarkerList]]:
  63. """
  64. requirement_details = AT URL (WS requirement_marker?)?
  65. | specifier WS? (requirement_marker)?
  66. """
  67. specifier = ""
  68. url = ""
  69. marker = None
  70. if tokenizer.check("AT"):
  71. tokenizer.read()
  72. tokenizer.consume("WS")
  73. url_start = tokenizer.position
  74. url = tokenizer.expect("URL", expected="URL after @").text
  75. if tokenizer.check("END", peek=True):
  76. return (url, specifier, marker)
  77. tokenizer.expect("WS", expected="whitespace after URL")
  78. # The input might end after whitespace.
  79. if tokenizer.check("END", peek=True):
  80. return (url, specifier, marker)
  81. marker = _parse_requirement_marker(
  82. tokenizer, span_start=url_start, after="URL and whitespace"
  83. )
  84. else:
  85. specifier_start = tokenizer.position
  86. specifier = _parse_specifier(tokenizer)
  87. tokenizer.consume("WS")
  88. if tokenizer.check("END", peek=True):
  89. return (url, specifier, marker)
  90. marker = _parse_requirement_marker(
  91. tokenizer,
  92. span_start=specifier_start,
  93. after=(
  94. "version specifier"
  95. if specifier
  96. else "name and no valid version specifier"
  97. ),
  98. )
  99. return (url, specifier, marker)
  100. def _parse_requirement_marker(
  101. tokenizer: Tokenizer, *, span_start: int, after: str
  102. ) -> MarkerList:
  103. """
  104. requirement_marker = SEMICOLON marker WS?
  105. """
  106. if not tokenizer.check("SEMICOLON"):
  107. tokenizer.raise_syntax_error(
  108. f"Expected end or semicolon (after {after})",
  109. span_start=span_start,
  110. )
  111. tokenizer.read()
  112. marker = _parse_marker(tokenizer)
  113. tokenizer.consume("WS")
  114. return marker
  115. def _parse_extras(tokenizer: Tokenizer) -> List[str]:
  116. """
  117. extras = (LEFT_BRACKET wsp* extras_list? wsp* RIGHT_BRACKET)?
  118. """
  119. if not tokenizer.check("LEFT_BRACKET", peek=True):
  120. return []
  121. with tokenizer.enclosing_tokens(
  122. "LEFT_BRACKET",
  123. "RIGHT_BRACKET",
  124. around="extras",
  125. ):
  126. tokenizer.consume("WS")
  127. extras = _parse_extras_list(tokenizer)
  128. tokenizer.consume("WS")
  129. return extras
  130. def _parse_extras_list(tokenizer: Tokenizer) -> List[str]:
  131. """
  132. extras_list = identifier (wsp* ',' wsp* identifier)*
  133. """
  134. extras: List[str] = []
  135. if not tokenizer.check("IDENTIFIER"):
  136. return extras
  137. extras.append(tokenizer.read().text)
  138. while True:
  139. tokenizer.consume("WS")
  140. if tokenizer.check("IDENTIFIER", peek=True):
  141. tokenizer.raise_syntax_error("Expected comma between extra names")
  142. elif not tokenizer.check("COMMA"):
  143. break
  144. tokenizer.read()
  145. tokenizer.consume("WS")
  146. extra_token = tokenizer.expect("IDENTIFIER", expected="extra name after comma")
  147. extras.append(extra_token.text)
  148. return extras
  149. def _parse_specifier(tokenizer: Tokenizer) -> str:
  150. """
  151. specifier = LEFT_PARENTHESIS WS? version_many WS? RIGHT_PARENTHESIS
  152. | WS? version_many WS?
  153. """
  154. with tokenizer.enclosing_tokens(
  155. "LEFT_PARENTHESIS",
  156. "RIGHT_PARENTHESIS",
  157. around="version specifier",
  158. ):
  159. tokenizer.consume("WS")
  160. parsed_specifiers = _parse_version_many(tokenizer)
  161. tokenizer.consume("WS")
  162. return parsed_specifiers
  163. def _parse_version_many(tokenizer: Tokenizer) -> str:
  164. """
  165. version_many = (SPECIFIER (WS? COMMA WS? SPECIFIER)*)?
  166. """
  167. parsed_specifiers = ""
  168. while tokenizer.check("SPECIFIER"):
  169. span_start = tokenizer.position
  170. parsed_specifiers += tokenizer.read().text
  171. if tokenizer.check("VERSION_PREFIX_TRAIL", peek=True):
  172. tokenizer.raise_syntax_error(
  173. ".* suffix can only be used with `==` or `!=` operators",
  174. span_start=span_start,
  175. span_end=tokenizer.position + 1,
  176. )
  177. if tokenizer.check("VERSION_LOCAL_LABEL_TRAIL", peek=True):
  178. tokenizer.raise_syntax_error(
  179. "Local version label can only be used with `==` or `!=` operators",
  180. span_start=span_start,
  181. span_end=tokenizer.position,
  182. )
  183. tokenizer.consume("WS")
  184. if not tokenizer.check("COMMA"):
  185. break
  186. parsed_specifiers += tokenizer.read().text
  187. tokenizer.consume("WS")
  188. return parsed_specifiers
  189. # --------------------------------------------------------------------------------------
  190. # Recursive descent parser for marker expression
  191. # --------------------------------------------------------------------------------------
  192. def parse_marker(source: str) -> MarkerList:
  193. return _parse_full_marker(Tokenizer(source, rules=DEFAULT_RULES))
  194. def _parse_full_marker(tokenizer: Tokenizer) -> MarkerList:
  195. retval = _parse_marker(tokenizer)
  196. tokenizer.expect("END", expected="end of marker expression")
  197. return retval
  198. def _parse_marker(tokenizer: Tokenizer) -> MarkerList:
  199. """
  200. marker = marker_atom (BOOLOP marker_atom)+
  201. """
  202. expression = [_parse_marker_atom(tokenizer)]
  203. while tokenizer.check("BOOLOP"):
  204. token = tokenizer.read()
  205. expr_right = _parse_marker_atom(tokenizer)
  206. expression.extend((token.text, expr_right))
  207. return expression
  208. def _parse_marker_atom(tokenizer: Tokenizer) -> MarkerAtom:
  209. """
  210. marker_atom = WS? LEFT_PARENTHESIS WS? marker WS? RIGHT_PARENTHESIS WS?
  211. | WS? marker_item WS?
  212. """
  213. tokenizer.consume("WS")
  214. if tokenizer.check("LEFT_PARENTHESIS", peek=True):
  215. with tokenizer.enclosing_tokens(
  216. "LEFT_PARENTHESIS",
  217. "RIGHT_PARENTHESIS",
  218. around="marker expression",
  219. ):
  220. tokenizer.consume("WS")
  221. marker: MarkerAtom = _parse_marker(tokenizer)
  222. tokenizer.consume("WS")
  223. else:
  224. marker = _parse_marker_item(tokenizer)
  225. tokenizer.consume("WS")
  226. return marker
  227. def _parse_marker_item(tokenizer: Tokenizer) -> MarkerItem:
  228. """
  229. marker_item = WS? marker_var WS? marker_op WS? marker_var WS?
  230. """
  231. tokenizer.consume("WS")
  232. marker_var_left = _parse_marker_var(tokenizer)
  233. tokenizer.consume("WS")
  234. marker_op = _parse_marker_op(tokenizer)
  235. tokenizer.consume("WS")
  236. marker_var_right = _parse_marker_var(tokenizer)
  237. tokenizer.consume("WS")
  238. return (marker_var_left, marker_op, marker_var_right)
  239. def _parse_marker_var(tokenizer: Tokenizer) -> MarkerVar:
  240. """
  241. marker_var = VARIABLE | QUOTED_STRING
  242. """
  243. if tokenizer.check("VARIABLE"):
  244. return process_env_var(tokenizer.read().text.replace(".", "_"))
  245. elif tokenizer.check("QUOTED_STRING"):
  246. return process_python_str(tokenizer.read().text)
  247. else:
  248. tokenizer.raise_syntax_error(
  249. message="Expected a marker variable or quoted string"
  250. )
  251. def process_env_var(env_var: str) -> Variable:
  252. if env_var in ("platform_python_implementation", "python_implementation"):
  253. return Variable("platform_python_implementation")
  254. else:
  255. return Variable(env_var)
  256. def process_python_str(python_str: str) -> Value:
  257. value = ast.literal_eval(python_str)
  258. return Value(str(value))
  259. def _parse_marker_op(tokenizer: Tokenizer) -> Op:
  260. """
  261. marker_op = IN | NOT IN | OP
  262. """
  263. if tokenizer.check("IN"):
  264. tokenizer.read()
  265. return Op("in")
  266. elif tokenizer.check("NOT"):
  267. tokenizer.read()
  268. tokenizer.expect("WS", expected="whitespace after 'not'")
  269. tokenizer.expect("IN", expected="'in' after 'not'")
  270. return Op("not in")
  271. elif tokenizer.check("OP"):
  272. return Op(tokenizer.read().text)
  273. else:
  274. return tokenizer.raise_syntax_error(
  275. "Expected marker operator, one of "
  276. "<=, <, !=, ==, >=, >, ~=, ===, in, not in"
  277. )