common.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424
  1. # common.py
  2. from .core import *
  3. from .helpers import delimited_list, any_open_tag, any_close_tag
  4. from datetime import datetime
  5. # some other useful expressions - using lower-case class name since we are really using this as a namespace
  6. class pyparsing_common:
  7. """Here are some common low-level expressions that may be useful in
  8. jump-starting parser development:
  9. - numeric forms (:class:`integers<integer>`, :class:`reals<real>`,
  10. :class:`scientific notation<sci_real>`)
  11. - common :class:`programming identifiers<identifier>`
  12. - network addresses (:class:`MAC<mac_address>`,
  13. :class:`IPv4<ipv4_address>`, :class:`IPv6<ipv6_address>`)
  14. - ISO8601 :class:`dates<iso8601_date>` and
  15. :class:`datetime<iso8601_datetime>`
  16. - :class:`UUID<uuid>`
  17. - :class:`comma-separated list<comma_separated_list>`
  18. - :class:`url`
  19. Parse actions:
  20. - :class:`convertToInteger`
  21. - :class:`convertToFloat`
  22. - :class:`convertToDate`
  23. - :class:`convertToDatetime`
  24. - :class:`stripHTMLTags`
  25. - :class:`upcaseTokens`
  26. - :class:`downcaseTokens`
  27. Example::
  28. pyparsing_common.number.runTests('''
  29. # any int or real number, returned as the appropriate type
  30. 100
  31. -100
  32. +100
  33. 3.14159
  34. 6.02e23
  35. 1e-12
  36. ''')
  37. pyparsing_common.fnumber.runTests('''
  38. # any int or real number, returned as float
  39. 100
  40. -100
  41. +100
  42. 3.14159
  43. 6.02e23
  44. 1e-12
  45. ''')
  46. pyparsing_common.hex_integer.runTests('''
  47. # hex numbers
  48. 100
  49. FF
  50. ''')
  51. pyparsing_common.fraction.runTests('''
  52. # fractions
  53. 1/2
  54. -3/4
  55. ''')
  56. pyparsing_common.mixed_integer.runTests('''
  57. # mixed fractions
  58. 1
  59. 1/2
  60. -3/4
  61. 1-3/4
  62. ''')
  63. import uuid
  64. pyparsing_common.uuid.setParseAction(tokenMap(uuid.UUID))
  65. pyparsing_common.uuid.runTests('''
  66. # uuid
  67. 12345678-1234-5678-1234-567812345678
  68. ''')
  69. prints::
  70. # any int or real number, returned as the appropriate type
  71. 100
  72. [100]
  73. -100
  74. [-100]
  75. +100
  76. [100]
  77. 3.14159
  78. [3.14159]
  79. 6.02e23
  80. [6.02e+23]
  81. 1e-12
  82. [1e-12]
  83. # any int or real number, returned as float
  84. 100
  85. [100.0]
  86. -100
  87. [-100.0]
  88. +100
  89. [100.0]
  90. 3.14159
  91. [3.14159]
  92. 6.02e23
  93. [6.02e+23]
  94. 1e-12
  95. [1e-12]
  96. # hex numbers
  97. 100
  98. [256]
  99. FF
  100. [255]
  101. # fractions
  102. 1/2
  103. [0.5]
  104. -3/4
  105. [-0.75]
  106. # mixed fractions
  107. 1
  108. [1]
  109. 1/2
  110. [0.5]
  111. -3/4
  112. [-0.75]
  113. 1-3/4
  114. [1.75]
  115. # uuid
  116. 12345678-1234-5678-1234-567812345678
  117. [UUID('12345678-1234-5678-1234-567812345678')]
  118. """
  119. convert_to_integer = token_map(int)
  120. """
  121. Parse action for converting parsed integers to Python int
  122. """
  123. convert_to_float = token_map(float)
  124. """
  125. Parse action for converting parsed numbers to Python float
  126. """
  127. integer = Word(nums).set_name("integer").set_parse_action(convert_to_integer)
  128. """expression that parses an unsigned integer, returns an int"""
  129. hex_integer = (
  130. Word(hexnums).set_name("hex integer").set_parse_action(token_map(int, 16))
  131. )
  132. """expression that parses a hexadecimal integer, returns an int"""
  133. signed_integer = (
  134. Regex(r"[+-]?\d+")
  135. .set_name("signed integer")
  136. .set_parse_action(convert_to_integer)
  137. )
  138. """expression that parses an integer with optional leading sign, returns an int"""
  139. fraction = (
  140. signed_integer().set_parse_action(convert_to_float)
  141. + "/"
  142. + signed_integer().set_parse_action(convert_to_float)
  143. ).set_name("fraction")
  144. """fractional expression of an integer divided by an integer, returns a float"""
  145. fraction.add_parse_action(lambda tt: tt[0] / tt[-1])
  146. mixed_integer = (
  147. fraction | signed_integer + Opt(Opt("-").suppress() + fraction)
  148. ).set_name("fraction or mixed integer-fraction")
  149. """mixed integer of the form 'integer - fraction', with optional leading integer, returns float"""
  150. mixed_integer.add_parse_action(sum)
  151. real = (
  152. Regex(r"[+-]?(?:\d+\.\d*|\.\d+)")
  153. .set_name("real number")
  154. .set_parse_action(convert_to_float)
  155. )
  156. """expression that parses a floating point number and returns a float"""
  157. sci_real = (
  158. Regex(r"[+-]?(?:\d+(?:[eE][+-]?\d+)|(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)?)")
  159. .set_name("real number with scientific notation")
  160. .set_parse_action(convert_to_float)
  161. )
  162. """expression that parses a floating point number with optional
  163. scientific notation and returns a float"""
  164. # streamlining this expression makes the docs nicer-looking
  165. number = (sci_real | real | signed_integer).setName("number").streamline()
  166. """any numeric expression, returns the corresponding Python type"""
  167. fnumber = (
  168. Regex(r"[+-]?\d+\.?\d*([eE][+-]?\d+)?")
  169. .set_name("fnumber")
  170. .set_parse_action(convert_to_float)
  171. )
  172. """any int or real number, returned as float"""
  173. identifier = Word(identchars, identbodychars).set_name("identifier")
  174. """typical code identifier (leading alpha or '_', followed by 0 or more alphas, nums, or '_')"""
  175. ipv4_address = Regex(
  176. r"(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(\.(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}"
  177. ).set_name("IPv4 address")
  178. "IPv4 address (``0.0.0.0 - 255.255.255.255``)"
  179. _ipv6_part = Regex(r"[0-9a-fA-F]{1,4}").set_name("hex_integer")
  180. _full_ipv6_address = (_ipv6_part + (":" + _ipv6_part) * 7).set_name(
  181. "full IPv6 address"
  182. )
  183. _short_ipv6_address = (
  184. Opt(_ipv6_part + (":" + _ipv6_part) * (0, 6))
  185. + "::"
  186. + Opt(_ipv6_part + (":" + _ipv6_part) * (0, 6))
  187. ).set_name("short IPv6 address")
  188. _short_ipv6_address.add_condition(
  189. lambda t: sum(1 for tt in t if pyparsing_common._ipv6_part.matches(tt)) < 8
  190. )
  191. _mixed_ipv6_address = ("::ffff:" + ipv4_address).set_name("mixed IPv6 address")
  192. ipv6_address = Combine(
  193. (_full_ipv6_address | _mixed_ipv6_address | _short_ipv6_address).set_name(
  194. "IPv6 address"
  195. )
  196. ).set_name("IPv6 address")
  197. "IPv6 address (long, short, or mixed form)"
  198. mac_address = Regex(
  199. r"[0-9a-fA-F]{2}([:.-])[0-9a-fA-F]{2}(?:\1[0-9a-fA-F]{2}){4}"
  200. ).set_name("MAC address")
  201. "MAC address xx:xx:xx:xx:xx (may also have '-' or '.' delimiters)"
  202. @staticmethod
  203. def convert_to_date(fmt: str = "%Y-%m-%d"):
  204. """
  205. Helper to create a parse action for converting parsed date string to Python datetime.date
  206. Params -
  207. - fmt - format to be passed to datetime.strptime (default= ``"%Y-%m-%d"``)
  208. Example::
  209. date_expr = pyparsing_common.iso8601_date.copy()
  210. date_expr.setParseAction(pyparsing_common.convertToDate())
  211. print(date_expr.parseString("1999-12-31"))
  212. prints::
  213. [datetime.date(1999, 12, 31)]
  214. """
  215. def cvt_fn(ss, ll, tt):
  216. try:
  217. return datetime.strptime(tt[0], fmt).date()
  218. except ValueError as ve:
  219. raise ParseException(ss, ll, str(ve))
  220. return cvt_fn
  221. @staticmethod
  222. def convert_to_datetime(fmt: str = "%Y-%m-%dT%H:%M:%S.%f"):
  223. """Helper to create a parse action for converting parsed
  224. datetime string to Python datetime.datetime
  225. Params -
  226. - fmt - format to be passed to datetime.strptime (default= ``"%Y-%m-%dT%H:%M:%S.%f"``)
  227. Example::
  228. dt_expr = pyparsing_common.iso8601_datetime.copy()
  229. dt_expr.setParseAction(pyparsing_common.convertToDatetime())
  230. print(dt_expr.parseString("1999-12-31T23:59:59.999"))
  231. prints::
  232. [datetime.datetime(1999, 12, 31, 23, 59, 59, 999000)]
  233. """
  234. def cvt_fn(s, l, t):
  235. try:
  236. return datetime.strptime(t[0], fmt)
  237. except ValueError as ve:
  238. raise ParseException(s, l, str(ve))
  239. return cvt_fn
  240. iso8601_date = Regex(
  241. r"(?P<year>\d{4})(?:-(?P<month>\d\d)(?:-(?P<day>\d\d))?)?"
  242. ).set_name("ISO8601 date")
  243. "ISO8601 date (``yyyy-mm-dd``)"
  244. iso8601_datetime = Regex(
  245. r"(?P<year>\d{4})-(?P<month>\d\d)-(?P<day>\d\d)[T ](?P<hour>\d\d):(?P<minute>\d\d)(:(?P<second>\d\d(\.\d*)?)?)?(?P<tz>Z|[+-]\d\d:?\d\d)?"
  246. ).set_name("ISO8601 datetime")
  247. "ISO8601 datetime (``yyyy-mm-ddThh:mm:ss.s(Z|+-00:00)``) - trailing seconds, milliseconds, and timezone optional; accepts separating ``'T'`` or ``' '``"
  248. uuid = Regex(r"[0-9a-fA-F]{8}(-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}").set_name("UUID")
  249. "UUID (``xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx``)"
  250. _html_stripper = any_open_tag.suppress() | any_close_tag.suppress()
  251. @staticmethod
  252. def strip_html_tags(s: str, l: int, tokens: ParseResults):
  253. """Parse action to remove HTML tags from web page HTML source
  254. Example::
  255. # strip HTML links from normal text
  256. text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
  257. td, td_end = makeHTMLTags("TD")
  258. table_text = td + SkipTo(td_end).setParseAction(pyparsing_common.stripHTMLTags)("body") + td_end
  259. print(table_text.parseString(text).body)
  260. Prints::
  261. More info at the pyparsing wiki page
  262. """
  263. return pyparsing_common._html_stripper.transform_string(tokens[0])
  264. _commasepitem = (
  265. Combine(
  266. OneOrMore(
  267. ~Literal(",")
  268. + ~LineEnd()
  269. + Word(printables, exclude_chars=",")
  270. + Opt(White(" \t") + ~FollowedBy(LineEnd() | ","))
  271. )
  272. )
  273. .streamline()
  274. .set_name("commaItem")
  275. )
  276. comma_separated_list = delimited_list(
  277. Opt(quoted_string.copy() | _commasepitem, default="")
  278. ).set_name("comma separated list")
  279. """Predefined expression of 1 or more printable words or quoted strings, separated by commas."""
  280. upcase_tokens = staticmethod(token_map(lambda t: t.upper()))
  281. """Parse action to convert tokens to upper case."""
  282. downcase_tokens = staticmethod(token_map(lambda t: t.lower()))
  283. """Parse action to convert tokens to lower case."""
  284. # fmt: off
  285. url = Regex(
  286. # https://mathiasbynens.be/demo/url-regex
  287. # https://gist.github.com/dperini/729294
  288. r"^" +
  289. # protocol identifier (optional)
  290. # short syntax // still required
  291. r"(?:(?:(?P<scheme>https?|ftp):)?\/\/)" +
  292. # user:pass BasicAuth (optional)
  293. r"(?:(?P<auth>\S+(?::\S*)?)@)?" +
  294. r"(?P<host>" +
  295. # IP address exclusion
  296. # private & local networks
  297. r"(?!(?:10|127)(?:\.\d{1,3}){3})" +
  298. r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})" +
  299. r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})" +
  300. # IP address dotted notation octets
  301. # excludes loopback network 0.0.0.0
  302. # excludes reserved space >= 224.0.0.0
  303. # excludes network & broadcast addresses
  304. # (first & last IP address of each class)
  305. r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])" +
  306. r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}" +
  307. r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" +
  308. r"|" +
  309. # host & domain names, may end with dot
  310. # can be replaced by a shortest alternative
  311. # (?![-_])(?:[-\w\u00a1-\uffff]{0,63}[^-_]\.)+
  312. r"(?:" +
  313. r"(?:" +
  314. r"[a-z0-9\u00a1-\uffff]" +
  315. r"[a-z0-9\u00a1-\uffff_-]{0,62}" +
  316. r")?" +
  317. r"[a-z0-9\u00a1-\uffff]\." +
  318. r")+" +
  319. # TLD identifier name, may end with dot
  320. r"(?:[a-z\u00a1-\uffff]{2,}\.?)" +
  321. r")" +
  322. # port number (optional)
  323. r"(:(?P<port>\d{2,5}))?" +
  324. # resource path (optional)
  325. r"(?P<path>\/[^?# ]*)?" +
  326. # query string (optional)
  327. r"(\?(?P<query>[^#]*))?" +
  328. # fragment (optional)
  329. r"(#(?P<fragment>\S*))?" +
  330. r"$"
  331. ).set_name("url")
  332. # fmt: on
  333. # pre-PEP8 compatibility names
  334. convertToInteger = convert_to_integer
  335. convertToFloat = convert_to_float
  336. convertToDate = convert_to_date
  337. convertToDatetime = convert_to_datetime
  338. stripHTMLTags = strip_html_tags
  339. upcaseTokens = upcase_tokens
  340. downcaseTokens = downcase_tokens
  341. _builtin_exprs = [
  342. v for v in vars(pyparsing_common).values() if isinstance(v, ParserElement)
  343. ]