12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069 |
- # helpers.py
- import html.entities
- import re
- from . import __diag__
- from .core import *
- from .util import _bslash, _flatten, _escape_regex_range_chars
- #
- # global helpers
- #
def delimited_list(
    expr: Union[str, ParserElement],
    delim: Union[str, ParserElement] = ",",
    combine: bool = False,
    min: OptionalType[int] = None,
    max: OptionalType[int] = None,
    *,
    allow_trailing_delim: bool = False,
) -> ParserElement:
    """Build an expression for ``expr`` repeated with ``delim`` between
    successive occurrences (the delimiter defaults to ``','``).

    With ``combine=False`` (the default) the delimiter is wrapped in
    :class:`Suppress`, so only the list items come back, as separate
    tokens. With ``combine=True`` the whole list is wrapped in
    :class:`Combine` and comes back as a single string token with the
    delimiters included.

    ``min`` and ``max`` optionally bound the number of list items:
    ``min`` must be at least 1 and ``max`` may not be smaller than
    ``min``; a ``ValueError`` is raised otherwise.

    If ``allow_trailing_delim`` is ``True``, the list may end with one
    extra delimiter.

    Example::

        delimited_list(Word(alphas)).parse_string("aa,bb,cc") # -> ['aa', 'bb', 'cc']
        delimited_list(Word(hexnums), delim=':', combine=True).parse_string("AA:BB:CC:DD:EE") # -> ['AA:BB:CC:DD:EE']
    """
    if isinstance(expr, str_type):
        expr = ParserElement._literalStringClass(expr)

    # compose the display name from the delimiter as given, before it is
    # (possibly) wrapped in Suppress below
    expr_text = str(expr.copy().streamline())
    delim_text = str(delim)
    trailer = " [{}]".format(delim_text) if allow_trailing_delim else ""
    list_name = "{expr} [{delim} {expr}]...{end}".format(
        expr=expr_text, delim=delim_text, end=trailer
    )

    if not combine:
        delim = Suppress(delim)

    # the first item is matched on its own, so the repetition bounds for
    # the "(delim expr)" tail are each one less than the item bounds
    tail_min = None
    if min is not None:
        if min < 1:
            raise ValueError("min must be greater than 0")
        tail_min = min - 1

    tail_max = None
    if max is not None:
        if tail_min is not None and max <= tail_min:
            raise ValueError("max must be greater than, or equal to min")
        tail_max = max - 1

    list_expr = expr + (delim + expr)[tail_min, tail_max]
    if allow_trailing_delim:
        list_expr += Opt(delim)

    result = Combine(list_expr) if combine else list_expr
    return result.set_name(list_name)
def counted_array(
    expr: ParserElement,
    int_expr: OptionalType[ParserElement] = None,
    *,
    intExpr: OptionalType[ParserElement] = None,
) -> ParserElement:
    """Build an expression for a length-prefixed list of ``expr`` items.

    The pattern matched has the form::

        integer expr expr expr...

    where the leading integer says how many ``expr`` items follow. The
    count token itself is removed from the results; only the items are
    returned (plus any results names set by the count expression).

    ``int_expr`` (or its pre-PEP8 spelling ``intExpr``) may supply a custom
    expression for the count; its first token is used as the item count.
    When omitted, a ``Word(nums)`` converted to ``int`` is used.

    Example::

        counted_array(Word(alphas)).parse_string('2 ab cd ef')  # -> ['ab', 'cd']

        # in this parser, the leading integer value is given in binary,
        # '10' indicating that 2 values are in the array
        binary_constant = Word('01').set_parse_action(lambda t: int(t[0], 2))
        counted_array(Word(alphas), int_expr=binary_constant).parse_string('10 ab cd ef')  # -> ['ab', 'cd']

        # if other fields must be parsed after the count but before the
        # list items, give the fields results names and they will
        # be preserved in the returned ParseResults:
        count_with_metadata = integer + Word(alphas)("type")
        typed_array = counted_array(Word(alphanums), int_expr=count_with_metadata)("items")
        result = typed_array.parse_string("3 bool True True False")
        print(result.dump())

        # prints
        # ['True', 'True', 'False']
        # - items: ['True', 'True', 'False']
        # - type: 'bool'
    """
    count_expr = intExpr or int_expr
    items_expr = Forward()

    def bind_item_count(s, l, t):
        # once the count is known, define the Forward as exactly that
        # many repetitions of expr (or nothing at all for a zero count)
        nonlocal items_expr
        count = t[0]
        items_expr <<= (expr * count) if count else Empty()
        # clear list contents, but keep any named results
        del t[:]

    if count_expr is not None:
        # work on a copy so the caller's expression is left untouched
        count_expr = count_expr.copy()
    else:
        count_expr = Word(nums).set_parse_action(lambda t: int(t[0]))

    count_expr.set_name("arrayLen")
    count_expr.add_parse_action(bind_item_count, call_during_try=True)

    array_name = "(len) " + str(expr) + "..."
    return (count_expr + items_expr).set_name(array_name)
def match_previous_literal(expr: ParserElement) -> ParserElement:
    """Return an expression that matches a literal repeat of whatever
    ``expr`` most recently matched. For example::

        first = Word(nums)
        second = match_previous_literal(first)
        match_expr = first + ":" + second

    will match ``"1:1"``, but not ``"1:2"``. Because the repeat is
    matched as literal text, it will also match the leading ``"1:1"``
    in ``"1:10"``. If this is not desired, use
    :class:`match_previous_expr`. Do *not* use with packrat parsing
    enabled.
    """
    repeater = Forward()

    def copy_token_to_repeater(s, l, t):
        # redefine the Forward from the tokens that expr just matched
        if not t:
            repeater << Empty()
        elif len(t) == 1:
            repeater << t[0]
        else:
            # several (possibly nested) tokens: match them in sequence
            flat_tokens = _flatten(t.as_list())
            repeater << And(Literal(tok) for tok in flat_tokens)

    expr.add_parse_action(copy_token_to_repeater, callDuringTry=True)
    repeater.set_name("(prev) " + str(expr))
    return repeater
def match_previous_expr(expr: ParserElement) -> ParserElement:
    """Helper to define an expression that is indirectly defined from
    the tokens matched in a previous expression, that is, it looks for
    a 'repeat' of a previous expression. For example::

        first = Word(nums)
        second = match_previous_expr(first)
        match_expr = first + ":" + second

    will match ``"1:1"``, but not ``"1:2"``. Because this
    matches by expressions, will *not* match the leading ``"1:1"``
    in ``"1:10"``; the expressions are evaluated first, and then
    compared, so ``"1"`` is compared with ``"10"``. Do *not* use
    with packrat parsing enabled.

    The returned expression parses with a copy of ``expr`` and raises
    :class:`ParseException` when its flattened tokens differ from the
    flattened tokens of the most recent ``expr`` match.
    """
    rep = Forward()
    e2 = expr.copy()
    rep <<= e2

    def copy_token_to_repeater(s, l, t):
        # remember what expr matched; each match of expr installs a new
        # comparison action on the repeater
        matchTokens = _flatten(t.as_list())

        def must_match_these_tokens(s, l, t):
            theseTokens = _flatten(t.as_list())
            if theseTokens != matchTokens:
                # message previously lacked the space after "found"
                raise ParseException(
                    s, l, "Expected {}, found {}".format(matchTokens, theseTokens)
                )

        rep.set_parse_action(must_match_these_tokens, callDuringTry=True)

    expr.add_parse_action(copy_token_to_repeater, callDuringTry=True)
    rep.set_name("(prev) " + str(expr))
    return rep
def one_of(
    strs: Union[IterableType[str], str],
    caseless: bool = False,
    use_regex: bool = True,
    as_keyword: bool = False,
    *,
    useRegex: bool = True,
    asKeyword: bool = False,
) -> ParserElement:
    """Helper to quickly define a set of alternative :class:`Literal` s,
    and makes sure to do longest-first testing when there is a conflict,
    regardless of the input order, but returns
    a :class:`MatchFirst` for best performance.

    Parameters:

    - ``strs`` - a string of space-delimited literals, or a collection of
      string literals
    - ``caseless`` - treat all literals as caseless - (default= ``False``)
    - ``use_regex`` - as an optimization, will generate a :class:`Regex`
      object; if ``False``, or if creating the :class:`Regex` raises
      ``re.error``, a :class:`MatchFirst` object is generated instead
      - (default= ``True``)
    - ``as_keyword`` - enforce :class:`Keyword`-style matching on the
      generated expressions - (default= ``False``)
    - ``asKeyword`` and ``useRegex`` are retained for pre-PEP8 compatibility,
      but will be removed in a future release

    Returns :class:`NoMatch` if no symbols are given. Raises ``TypeError``
    if ``strs`` is neither a string nor an iterable.

    Example::

        comp_oper = one_of("< = > <= >= !=")
        var = Word(alphas)
        number = Word(nums)
        term = var | number
        comparison_expr = term + comp_oper + term
        print(comparison_expr.search_string("B = 12 AA=23 B<=AA AA>12"))

    prints::

        [['B', '=', '12'], ['AA', '=', '23'], ['B', '<=', 'AA'], ['AA', '>', '12']]
    """
    # merge the PEP8 and pre-PEP8 spellings of the same options
    asKeyword = asKeyword or as_keyword
    useRegex = useRegex and use_regex

    # a string in the ``caseless`` position usually means the caller wrote
    # one_of("a", "b") instead of one_of("a b") or one_of(["a", "b"])
    if (
        isinstance(caseless, str_type)
        and __diag__.warn_on_multiple_string_args_to_oneof
    ):
        warnings.warn(
            "More than one string argument passed to one_of, pass"
            " choices as a list or space-delimited string",
            stacklevel=2,
        )

    if caseless:
        isequal = lambda a, b: a.upper() == b.upper()
        masks = lambda a, b: b.upper().startswith(a.upper())
        parseElementClass = CaselessKeyword if asKeyword else CaselessLiteral
    else:
        isequal = lambda a, b: a == b
        masks = lambda a, b: b.startswith(a)
        parseElementClass = Keyword if asKeyword else Literal

    symbols: List[str] = []
    if isinstance(strs, str_type):
        symbols = strs.split()
    elif isinstance(strs, Iterable):
        symbols = list(strs)
    else:
        raise TypeError("Invalid argument to one_of, expected string or iterable")
    if not symbols:
        return NoMatch()

    # reorder given symbols to take care to avoid masking longer choices with shorter ones
    # (but only if the given symbols are not just single characters)
    if any(len(sym) > 1 for sym in symbols):
        i = 0
        while i < len(symbols) - 1:
            cur = symbols[i]
            for j, other in enumerate(symbols[i + 1 :]):
                if isequal(other, cur):
                    # duplicate of cur - drop it and rescan from cur
                    del symbols[i + j + 1]
                    break
                elif masks(cur, other):
                    # cur is a prefix of other - move other ahead of cur
                    del symbols[i + j + 1]
                    symbols.insert(i, other)
                    break
            else:
                i += 1

    if useRegex:
        re_flags: int = re.IGNORECASE if caseless else 0

        try:
            if all(len(sym) == 1 for sym in symbols):
                # symbols are just single characters, create range regex pattern
                patt = "[{}]".format(
                    "".join(_escape_regex_range_chars(sym) for sym in symbols)
                )
            else:
                patt = "|".join(re.escape(sym) for sym in symbols)

            # wrap with \b word break markers if defining as keywords
            if asKeyword:
                patt = r"\b(?:{})\b".format(patt)

            ret = Regex(patt, flags=re_flags).set_name(" | ".join(symbols))

            if caseless:
                # add parse action to return symbols as specified, not in random
                # casing as found in input string
                symbol_map = {sym.lower(): sym for sym in symbols}
                ret.add_parse_action(lambda s, l, t: symbol_map[t[0].lower()])

            return ret

        except re.error:
            # re.error is the exception class formerly reached through
            # sre_constants.error; the sre_constants module is deprecated
            # as of Python 3.11
            warnings.warn(
                "Exception creating Regex for one_of, building MatchFirst", stacklevel=2
            )

    # last resort, just use MatchFirst
    return MatchFirst(parseElementClass(sym) for sym in symbols).set_name(
        " | ".join(symbols)
    )
def dict_of(key: ParserElement, value: ParserElement) -> ParserElement:
    """Build a :class:`Dict` expression from separate key and value patterns.

    Each ``key + value`` pair is wrapped in a :class:`Group`, one or more
    such groups are matched with :class:`OneOrMore`, and the whole is
    wrapped in :class:`Dict`, so callers need not remember the nesting
    order themselves. The key pattern can include delimiting markers or
    punctuation, as long as they are suppressed, leaving only the
    significant key text. The value pattern can include named results.

    Example::

        text = "shape: SQUARE posn: upper left color: light blue texture: burlap"
        attr_label = label
        attr_value = Suppress(':') + OneOrMore(data_word, stop_on=label).set_parse_action(' '.join)

        # similar to Dict, but simpler call format
        result = dict_of(attr_label, attr_value).parse_string(text)
        print(result.dump())
        print(result['shape'])
        print(result.shape)  # object attribute access works too
        print(result.as_dict())

    prints::

        [['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'light blue'], ['texture', 'burlap']]
        - color: light blue
        - posn: upper left
        - shape: SQUARE
        - texture: burlap
        SQUARE
        SQUARE
        {'color': 'light blue', 'shape': 'SQUARE', 'posn': 'upper left', 'texture': 'burlap'}
    """
    entry = Group(key + value)
    entries = OneOrMore(entry)
    return Dict(entries)
def original_text_for(
    expr: ParserElement, as_string: bool = True, *, asString: bool = True
) -> ParserElement:
    """Wrap ``expr`` so that it returns the slice of the input string that
    it matched, instead of its individual tokens.

    Useful to restore the parsed fields of an HTML start tag into the raw
    tag text itself, or to revert separate tokens with intervening
    whitespace back to the original matching input text.

    By default the result is a string containing the original text. If
    ``as_string`` is passed as ``False``, the tokens are replaced in place
    by a single token holding the original text, so any results names
    matched inside ``expr`` are kept. If ``expr`` contains expressions
    with results names that you want to preserve, you must therefore pass
    ``as_string=False``.

    The ``asString`` pre-PEP8 argument is retained for compatibility,
    but will be removed in a future release.

    Example::

        src = "this is test <b> bold <i>text</i> </b> normal text "
        for tag in ("b", "i"):
            opener, closer = make_html_tags(tag)
            patt = original_text_for(opener + SkipTo(closer) + closer)
            print(patt.search_string(src)[0])

    prints::

        ['<b> bold <i>text</i> </b>']
        ['<i>text</i>']
    """
    want_string = asString and as_string

    # zero-width markers that record the parse location as a token
    start_marker = Empty().set_parse_action(lambda s, loc, t: loc)
    end_marker = start_marker.copy()
    end_marker.callPreparse = False
    wrapped = start_marker("_original_start") + expr + end_marker("_original_end")

    def text_as_string(s, l, t):
        return s[t._original_start : t._original_end]

    def text_in_results(s, l, t):
        # remove the marker names, then swap the tokens for the raw text
        begin = t.pop("_original_start")
        end = t.pop("_original_end")
        t[:] = [s[begin:end]]

    wrapped.set_parse_action(text_as_string if want_string else text_in_results)
    wrapped.ignoreExprs = expr.ignoreExprs
    wrapped.suppress_warning(Diagnostics.warn_ungrouped_named_tokens_in_collection)
    return wrapped
def ungroup(expr: ParserElement) -> ParserElement:
    """Return an expression that matches ``expr`` and replaces its results
    with their first element, undoing one level of grouping.
    """

    def first_token(t):
        return t[0]

    converter = TokenConverter(expr)
    return converter.add_parse_action(first_token)
def locatedExpr(expr: ParserElement) -> ParserElement:
    """
    (DEPRECATED - future code should use the Located class)
    Wrap ``expr`` in a group that also carries the input locations where
    the match starts and ends.

    The returned group has the following results names:

    - ``locn_start`` - location where matched expression begins
    - ``locn_end`` - location where matched expression ends
    - ``value`` - the actual parsed results

    Be careful if the input text contains ``<TAB>`` characters, you
    may want to call :class:`ParserElement.parseWithTabs`

    Example::

        wd = Word(alphas)
        for match in locatedExpr(wd).searchString("ljsdf123lksdjjf123lkkjj1222"):
            print(match)

    prints::

        [[0, 'ljsdf', 5]]
        [[8, 'lksdjjf', 15]]
        [[18, 'lkkjj', 23]]
    """
    # zero-width expression whose only token is the current parse location
    locator = Empty().set_parse_action(lambda ss, ll, tt: ll)

    start_marker = locator("locn_start")
    value = expr("value")
    end_marker = locator.copy().leaveWhitespace()("locn_end")
    return Group(start_marker + value + end_marker)
def nested_expr(
    opener: Union[str, ParserElement] = "(",
    closer: Union[str, ParserElement] = ")",
    content: OptionalType[ParserElement] = None,
    ignore_expr: ParserElement = quoted_string(),
    *,
    ignoreExpr: ParserElement = quoted_string(),
) -> ParserElement:
    """Build an expression for nested lists enclosed in opening and
    closing delimiters (``"("`` and ``")"`` by default).

    Parameters:

    - ``opener`` - opening character for a nested list
      (default= ``"("``); can also be a pyparsing expression
    - ``closer`` - closing character for a nested list
      (default= ``")"``); can also be a pyparsing expression
    - ``content`` - expression for items within the nested lists
      (default= ``None``)
    - ``ignore_expr`` - expression for ignoring opening and closing delimiters
      (default= :class:`quoted_string`)
    - ``ignoreExpr`` - this pre-PEP8 argument is retained for compatibility
      but will be removed in a future release

    If no ``content`` expression is given, ``opener`` and ``closer`` must
    both be strings, and a default content expression is built that
    captures the whitespace-delimited text between the delimiters as
    separate values.

    Use the ``ignore_expr`` argument to define expressions that may
    contain opening or closing characters that should not be treated as
    opening or closing characters for nesting, such as quoted_string or
    a comment expression. Specify multiple expressions using an
    :class:`Or` or :class:`MatchFirst`. The default is
    :class:`quoted_string`, but if no expressions are to be ignored, then
    pass ``None`` for this argument.

    Raises ``ValueError`` if ``opener`` and ``closer`` compare equal, or if
    ``content`` is omitted and either delimiter is not a string.

    Example::

        data_type = one_of("void int short long char float double")
        decl_data_type = Combine(data_type + Opt(Word('*')))
        ident = Word(alphas+'_', alphanums+'_')
        number = pyparsing_common.number
        arg = Group(decl_data_type + ident)
        LPAR, RPAR = map(Suppress, "()")

        code_body = nested_expr('{', '}', ignore_expr=(quoted_string | c_style_comment))

        c_function = (decl_data_type("type")
                      + ident("name")
                      + LPAR + Opt(delimited_list(arg), [])("args") + RPAR
                      + code_body("body"))
        c_function.ignore(c_style_comment)

        source_code = '''
            int is_odd(int x) {
                return (x%2);
            }

            int dec_to_hex(char hchar) {
                if (hchar >= '0' && hchar <= '9') {
                    return (ord(hchar)-ord('0'));
                } else {
                    return (10+ord(hchar)-ord('A'));
                }
            }
        '''
        for func in c_function.search_string(source_code):
            print("%(name)s (%(type)s) args: %(args)s" % func)

    prints::

        is_odd (int) args: [['int', 'x']]
        dec_to_hex (int) args: [['char', 'hchar']]
    """
    # reconcile the PEP8 and pre-PEP8 spellings: when the legacy argument
    # still equals its default, the new-style argument takes effect
    if ignoreExpr != ignore_expr:
        if ignoreExpr == quoted_string():
            ignoreExpr = ignore_expr

    if opener == closer:
        raise ValueError("opening and closing strings cannot be the same")

    if content is None:
        if not (isinstance(opener, str_type) and isinstance(closer, str_type)):
            raise ValueError(
                "opening and closing arguments must be strings if no content expression is given"
            )

        def strip_token(t):
            return t[0].strip()

        white = ParserElement.DEFAULT_WHITE_CHARS
        single_char_delims = len(opener) == 1 and len(closer) == 1

        if single_char_delims and ignoreExpr is not None:
            # one character at a time, so the ignore expression is
            # re-tested before every character
            content = Combine(
                OneOrMore(
                    ~ignoreExpr + CharsNotIn(opener + closer + white, exact=1)
                )
            ).set_parse_action(strip_token)
        elif single_char_delims:
            content = empty.copy() + CharsNotIn(
                opener + closer + white
            ).set_parse_action(strip_token)
        elif ignoreExpr is not None:
            # multi-character delimiters need lookaheads instead of a
            # simple excluded-character set
            content = Combine(
                OneOrMore(
                    ~ignoreExpr
                    + ~Literal(opener)
                    + ~Literal(closer)
                    + CharsNotIn(white, exact=1)
                )
            ).set_parse_action(strip_token)
        else:
            content = Combine(
                OneOrMore(
                    ~Literal(opener)
                    + ~Literal(closer)
                    + CharsNotIn(white, exact=1)
                )
            ).set_parse_action(strip_token)

    nested = Forward()
    if ignoreExpr is not None:
        items = ZeroOrMore(ignoreExpr | nested | content)
    else:
        items = ZeroOrMore(nested | content)
    nested <<= Group(Suppress(opener) + items + Suppress(closer))
    nested.set_name("nested %s%s expression" % (opener, closer))
    return nested
def _makeTags(tagStr, xml, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")):
    """Internal helper to construct opening and closing tag expressions, given a tag name.

    ``tagStr`` may be a string (converted to a :class:`Keyword`, caseless
    unless ``xml`` is true) or an expression, whose ``name`` is then used
    for naming. Returns the ``(open_tag, close_tag)`` pair of expressions.
    """
    if isinstance(tagStr, str_type):
        resname = tagStr
        tagStr = Keyword(tagStr, caseless=not xml)
    else:
        resname = tagStr.name

    attr_name = Word(alphas, alphanums + "_-:")
    if xml:
        # XML: attribute names as written, values always double-quoted
        attr_value = dbl_quoted_string.copy().set_parse_action(remove_quotes)
        attribute = Group(attr_name + Suppress("=") + attr_value)
    else:
        # HTML: lower-cased attribute names, values optional and
        # either quoted or bare
        attr_value = quoted_string.copy().set_parse_action(remove_quotes) | Word(
            printables, exclude_chars=">"
        )
        attribute = Group(
            attr_name.set_parse_action(lambda t: t[0].lower())
            + Opt(Suppress("=") + attr_value)
        )

    # "empty" is True when the tag ends with "/", False otherwise
    empty_flag = Opt("/", default=[False])("empty").set_parse_action(
        lambda s, l, t: t[0] == "/"
    )
    openTag = (
        suppress_LT
        + tagStr("tag")
        + Dict(ZeroOrMore(attribute))
        + empty_flag
        + suppress_GT
    )

    closeTag = Combine(Literal("</") + tagStr + ">", adjacent=False)

    openTag.set_name("<%s>" % resname)

    # results-name suffix: title-cased tag name with ":" and spaces removed
    name_suffix = "".join(resname.replace(":", " ").title().split())

    # add start<tagname> results name in parse action now that ungrouped names are not reported at two levels
    def add_start_name(t):
        t.__setitem__("start" + name_suffix, t.copy())

    openTag.add_parse_action(add_start_name)
    closeTag = closeTag("end" + name_suffix).set_name("</%s>" % resname)

    openTag.tag = resname
    closeTag.tag = resname
    openTag.tag_body = SkipTo(closeTag())
    return openTag, closeTag
def make_html_tags(
    tag_str: Union[str, ParserElement]
) -> Tuple[ParserElement, ParserElement]:
    """Build the opening and closing tag expressions for an HTML tag name.

    Matches tags in either upper or lower case, attributes with
    namespaces and with quoted or unquoted values.

    Example::

        text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
        # make_html_tags returns pyparsing expressions for the opening and
        # closing tags as a 2-tuple
        a, a_end = make_html_tags("A")
        link_expr = a + SkipTo(a_end)("link_text") + a_end

        for link in link_expr.search_string(text):
            # attributes in the <A> tag (like "href" shown here) are
            # also accessible as named results
            print(link.link_text, '->', link.href)

    prints::

        pyparsing -> https://github.com/pyparsing/pyparsing/wiki
    """
    open_tag, close_tag = _makeTags(tag_str, False)
    return open_tag, close_tag
def make_xml_tags(
    tag_str: Union[str, ParserElement]
) -> Tuple[ParserElement, ParserElement]:
    """Build the opening and closing tag expressions for an XML tag name.

    Matches tags only in the given upper/lower case.

    Example: similar to :class:`make_html_tags`
    """
    open_tag, close_tag = _makeTags(tag_str, True)
    return open_tag, close_tag
# open/close tag expressions whose tag name is any word of letters,
# digits, "_" and ":" (starting with a letter)
any_open_tag, any_close_tag = make_html_tags(
    Word(alphas, alphanums + "_:").set_name("any tag")
)

# HTML5 named character references, keyed by entity name with any
# trailing ";" removed
_htmlEntityMap = {
    name.rstrip(";"): text for name, text in html.entities.html5.items()
}

# matches "&name;" for any known entity name, exposing it as ``entity``
common_html_entity = Regex(
    "&(?P<entity>{});".format("|".join(_htmlEntityMap))
).set_name("common HTML entity")
def replace_html_entity(t):
    """Parse action that looks up the ``entity`` results name of ``t`` in the
    HTML entity map and returns the corresponding character(s), or ``None``
    if the name is not in the map."""
    entity_name = t.entity
    return _htmlEntityMap.get(entity_name)
class OpAssoc(Enum):
    """Operator associativity, as given in the operator specs passed to
    :func:`infix_notation`."""

    LEFT = 1  # operator is left-associative
    RIGHT = 2  # operator is right-associative
# Operator expression in an infix_notation operator spec: a single
# expression or string, or a pair of them (infix_notation requires the
# pair form when the operator takes 3 operands)
InfixNotationOperatorArgType = Union[
    ParserElement, str, Tuple[Union[ParserElement, str], Union[ParserElement, str]]
]
# One precedence level in infix_notation's op_list:
# (op_expr, num_operands, associativity) with an optional trailing parse action
InfixNotationOperatorSpec = Union[
    Tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
        OptionalType[ParseAction],
    ],
    Tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
    ],
]
def infix_notation(
    base_expr: ParserElement,
    op_list: List[InfixNotationOperatorSpec],
    lpar: Union[str, ParserElement] = Suppress("("),
    rpar: Union[str, ParserElement] = Suppress(")"),
) -> ParserElement:
    """Helper method for constructing grammars of expressions made up of
    operators working in a precedence hierarchy. Operators may be unary
    or binary, left- or right-associative. Parse actions can also be
    attached to operator expressions. The generated parser will also
    recognize the use of parentheses to override operator precedences
    (see example below).

    Note: if you define a deep operator list, you may see performance
    issues when using infix_notation. See
    :class:`ParserElement.enable_packrat` for a mechanism to potentially
    improve your parser performance.

    Parameters:

    - ``base_expr`` - expression representing the most basic operand to
      be used in the expression
    - ``op_list`` - list of tuples, one for each operator precedence level
      in the expression grammar; each tuple is of the form ``(op_expr,
      num_operands, right_left_assoc, (optional)parse_action)``, where:

      - ``op_expr`` is the pyparsing expression for the operator; may also
        be a string, which will be converted to a Literal; if ``num_operands``
        is 3, ``op_expr`` is a tuple of two expressions, for the two
        operators separating the 3 terms
      - ``num_operands`` is the number of terms for this operator (must be 1,
        2, or 3)
      - ``right_left_assoc`` is the indicator whether the operator is right
        or left associative, using the pyparsing-defined constants
        ``OpAssoc.RIGHT`` and ``OpAssoc.LEFT``.
      - ``parse_action`` is the parse action to be associated with
        expressions matching this operator expression (the parse action
        tuple member may be omitted); if the parse action is passed
        a tuple or list of functions, this is equivalent to calling
        ``set_parse_action(*fn)``
        (:class:`ParserElement.set_parse_action`)

      Each operator spec may be given as a tuple or as a list.
    - ``lpar`` - expression for matching left-parentheses
      (default= ``Suppress('(')``)
    - ``rpar`` - expression for matching right-parentheses
      (default= ``Suppress(')')``)

    Raises ``ValueError`` if an operator spec has an arity outside 1-3, an
    associativity other than ``OpAssoc.LEFT``/``OpAssoc.RIGHT``, or an arity
    of 3 without a two-element ``op_expr``.

    Example::

        # simple example of four-function arithmetic with ints and
        # variable names
        integer = pyparsing_common.signed_integer
        varname = pyparsing_common.identifier

        arith_expr = infix_notation(integer | varname,
            [
            ('-', 1, OpAssoc.RIGHT),
            (one_of('* /'), 2, OpAssoc.LEFT),
            (one_of('+ -'), 2, OpAssoc.LEFT),
            ])

        arith_expr.run_tests('''
            5+3*6
            (5+3)*6
            -2--11
            ''', full_dump=False)

    prints::

        5+3*6
        [[5, '+', [3, '*', 6]]]

        (5+3)*6
        [[[5, '+', 3], '*', 6]]

        -2--11
        [[['-', 2], '-', ['-', 11]]]
    """
    # captive version of FollowedBy that does not do parse actions or capture results names
    class _FB(FollowedBy):
        def parseImpl(self, instring, loc, doActions=True):
            self.expr.try_parse(instring, loc)
            return loc, []

    _FB.__name__ = "FollowedBy>"

    ret = Forward()
    lpar = Suppress(lpar)
    rpar = Suppress(rpar)
    lastExpr = base_expr | (lpar + ret + rpar)
    for operDef in op_list:
        # pad with None so the parse action member may be omitted; tuple()
        # lets a spec written as a list be padded the same way
        opExpr, arity, rightLeftAssoc, pa = (tuple(operDef) + (None,))[:4]
        if isinstance(opExpr, str_type):
            opExpr = ParserElement._literalStringClass(opExpr)
        if arity == 3:
            if not isinstance(opExpr, (tuple, list)) or len(opExpr) != 2:
                raise ValueError(
                    "if numterms=3, opExpr must be a tuple or list of two expressions"
                )
            opExpr1, opExpr2 = opExpr
            term_name = "{}{} term".format(opExpr1, opExpr2)
        else:
            term_name = "{} term".format(opExpr)

        if not 1 <= arity <= 3:
            raise ValueError("operator must be unary (1), binary (2), or ternary (3)")

        if rightLeftAssoc not in (OpAssoc.LEFT, OpAssoc.RIGHT):
            raise ValueError("operator must indicate right or left associativity")

        # each level is "this level's operator expression, or else the
        # next-tighter level"; the _FB lookahead checks that an operator
        # is actually present before the grouped match is attempted
        thisExpr = Forward().set_name(term_name)
        if rightLeftAssoc is OpAssoc.LEFT:
            if arity == 1:
                matchExpr = _FB(lastExpr + opExpr) + Group(lastExpr + opExpr[1, ...])
            elif arity == 2:
                if opExpr is not None:
                    matchExpr = _FB(lastExpr + opExpr + lastExpr) + Group(
                        lastExpr + (opExpr + lastExpr)[1, ...]
                    )
                else:
                    # no operator expression: operands are simply adjacent
                    matchExpr = _FB(lastExpr + lastExpr) + Group(lastExpr[2, ...])
            elif arity == 3:
                matchExpr = _FB(
                    lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr
                ) + Group(lastExpr + OneOrMore(opExpr1 + lastExpr + opExpr2 + lastExpr))
        elif rightLeftAssoc is OpAssoc.RIGHT:
            if arity == 1:
                # try to avoid LR with this extra test
                if not isinstance(opExpr, Opt):
                    opExpr = Opt(opExpr)
                matchExpr = _FB(opExpr.expr + thisExpr) + Group(opExpr + thisExpr)
            elif arity == 2:
                if opExpr is not None:
                    matchExpr = _FB(lastExpr + opExpr + thisExpr) + Group(
                        lastExpr + (opExpr + thisExpr)[1, ...]
                    )
                else:
                    matchExpr = _FB(lastExpr + thisExpr) + Group(
                        lastExpr + thisExpr[1, ...]
                    )
            elif arity == 3:
                matchExpr = _FB(
                    lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr
                ) + Group(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr)
        if pa:
            if isinstance(pa, (tuple, list)):
                matchExpr.set_parse_action(*pa)
            else:
                matchExpr.set_parse_action(pa)
        thisExpr <<= (matchExpr | lastExpr).set_name(term_name)
        lastExpr = thisExpr
    ret <<= lastExpr
    return ret
def indentedBlock(blockStatementExpr, indentStack, indent=True, backup_stacks=[]):
    """
    (DEPRECATED - use IndentedBlock class instead)
    Helper method for defining space-delimited indentation blocks,
    such as those used to define block statements in Python source code.

    Parameters:

    - ``blockStatementExpr`` - expression defining syntax of statement that
      is repeated within the indented block
    - ``indentStack`` - list created by caller to manage indentation stack
      (multiple ``statementWithIndentedBlock`` expressions within a single
      grammar should share a common ``indentStack``)
    - ``indent`` - boolean indicating whether block must be indented beyond
      the current level; set to ``False`` for block of left-most statements
      (default= ``True``)
    - ``backup_stacks`` - list of saved copies of ``indentStack``; a copy of
      the current ``indentStack`` is appended on every call, the most recent
      entry is copied back into ``indentStack`` when the block fails to
      parse, and one entry is popped each time a block parses successfully

    A valid block must contain at least one ``blockStatement``.

    (Note that indentedBlock uses internal parse actions which make it
    incompatible with packrat parsing.)

    Example::

        data = '''
        def A(z):
          A1
          B = 100
          G = A2
          A2
          A3
        B
        def BB(a,b,c):
          BB1
          def BBA():
            bba1
            bba2
            bba3
        C
        D
        def spam(x,y):
             def eggs(z):
                 pass
        '''

        indentStack = [1]
        stmt = Forward()

        identifier = Word(alphas, alphanums)
        funcDecl = ("def" + identifier + Group("(" + Opt(delimitedList(identifier)) + ")") + ":")
        func_body = indentedBlock(stmt, indentStack)
        funcDef = Group(funcDecl + func_body)

        rvalue = Forward()
        funcCall = Group(identifier + "(" + Opt(delimitedList(rvalue)) + ")")
        rvalue << (funcCall | identifier | Word(nums))
        assignment = Group(identifier + "=" + rvalue)
        stmt << (funcDef | assignment | identifier)

        module_body = OneOrMore(stmt)

        parseTree = module_body.parseString(data)
        parseTree.pprint()

    prints::

        [['def',
          'A',
          ['(', 'z', ')'],
          ':',
          [['A1'], [['B', '=', '100']], [['G', '=', 'A2']], ['A2'], ['A3']]],
         'B',
         ['def',
          'BB',
          ['(', 'a', 'b', 'c', ')'],
          ':',
          [['BB1'], [['def', 'BBA', ['(', ')'], ':', [['bba1'], ['bba2'], ['bba3']]]]]],
         'C',
         'D',
         ['def',
          'spam',
          ['(', 'x', 'y', ')'],
          ':',
          [[['def', 'eggs', ['(', 'z', ')'], ':', [['pass']]]]]]]
    """
    # NOTE(review): the mutable default means every call that omits
    # ``backup_stacks`` shares one list. Replacing it with a per-call list
    # would change how snapshots accumulate across blocks, so it is left
    # as is - confirm whether the sharing is intended.

    # Immutable snapshot of the stack as it is right now; used as the
    # fallback when every entry of ``backup_stacks`` has already been popped.
    initial_stack = tuple(indentStack)
    backup_stacks.append(indentStack[:])

    def reset_stack():
        # The success parse action below pops entries from ``backup_stacks``,
        # so the list may be empty by the time a later block fails. Indexing
        # [-1] unconditionally would raise IndexError from inside the fail
        # action; fall back to the construction-time snapshot instead.
        indentStack[:] = backup_stacks[-1] if backup_stacks else initial_stack

    def checkPeerIndent(s, l, t):
        # Nothing to check once the location is at or past the end of input.
        if l >= len(s):
            return
        curCol = col(l, s)
        if curCol != indentStack[-1]:
            if curCol > indentStack[-1]:
                raise ParseException(s, l, "illegal nesting")
            raise ParseException(s, l, "not a peer entry")

    def checkSubIndent(s, l, t):
        # A sub-block must start in a column to the right of the current
        # level; when it does, that column becomes the new top of the stack.
        curCol = col(l, s)
        if curCol > indentStack[-1]:
            indentStack.append(curCol)
        else:
            raise ParseException(s, l, "not a subentry")

    def checkUnindent(s, l, t):
        if l >= len(s):
            return
        curCol = col(l, s)
        # The column must be one that is already recorded on the stack.
        if not (indentStack and curCol in indentStack):
            raise ParseException(s, l, "not an unindent")
        if curCol < indentStack[-1]:
            indentStack.pop()

    NL = OneOrMore(LineEnd().set_whitespace_chars("\t ").suppress())
    INDENT = (Empty() + Empty().set_parse_action(checkSubIndent)).set_name("INDENT")
    PEER = Empty().set_parse_action(checkPeerIndent).set_name("")
    UNDENT = Empty().set_parse_action(checkUnindent).set_name("UNINDENT")
    if indent:
        smExpr = Group(
            Opt(NL)
            + INDENT
            + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
            + UNDENT
        )
    else:
        smExpr = Group(
            Opt(NL)
            + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
            + Opt(UNDENT)
        )

    # add a parse action to remove backup_stack from list of backups
    smExpr.add_parse_action(
        lambda: backup_stacks.pop(-1) and None if backup_stacks else None
    )
    # On failure, restore ``indentStack`` from the saved snapshot.
    smExpr.set_fail_action(lambda a, b, c, d: reset_stack())
    blockStatementExpr.ignore(_bslash + LineEnd())
    return smExpr.set_name("indented block")
# Ready-made expressions for common comment syntaxes. They are needed often
# and are easy to get subtly wrong when written by hand.

# "/*", then any run of characters that are not "*" (or a "*" that is not
# followed by "/"), then the closing "*/".
c_style_comment = Combine(
    Regex(r"/\*(?:[^*]|\*(?!/))*") + "*/"
).set_name("C style comment")
"Comment of the form ``/* ... */``"

html_comment = Regex(r"<!--[\s\S]*?-->").set_name("HTML comment")
"Comment of the form ``<!-- ... -->``"

# Matches the regex ``.*`` with ``leave_whitespace()`` applied, i.e. without
# skipping leading whitespace first.
rest_of_line = Regex(r".*").leave_whitespace().set_name("rest of line")

# "//" followed by backslash-newline pairs or any characters other than a
# newline.
dbl_slash_comment = Regex(r"//(?:\\\n|[^\n])*").set_name("// comment")
"Comment of the form ``// ... (to end of line)``"

# Either a "/* ... */" comment or a "//" comment. The parentheses only make
# the existing operator precedence explicit ("+" binds tighter than "|").
cpp_style_comment = Combine(
    (Regex(r"/\*(?:[^*]|\*(?!/))*") + "*/") | dbl_slash_comment
).set_name("C++ style comment")
"Comment of either form :class:`c_style_comment` or :class:`dbl_slash_comment`"

# Same expression object as cpp_style_comment under a second name.
java_style_comment = cpp_style_comment
"Same as :class:`cpp_style_comment`"

python_style_comment = Regex(r"#.*").set_name("Python style comment")
"Comment of the form ``# ... (to end of line)``"
# Collect every ParserElement bound in this namespace so far, for future
# reference if a global default value gets updated.
_builtin_exprs = [
    candidate
    for candidate in vars().values()
    if isinstance(candidate, ParserElement)
]
# Pre-PEP8 (camelCase) names, bound to the same objects as their snake_case
# counterparts.

# helper functions
delimitedList = delimited_list
countedArray = counted_array
matchPreviousLiteral = match_previous_literal
matchPreviousExpr = match_previous_expr
oneOf = one_of
dictOf = dict_of
originalTextFor = original_text_for
nestedExpr = nested_expr

# HTML / XML helpers
makeHTMLTags = make_html_tags
makeXMLTags = make_xml_tags
anyOpenTag = any_open_tag
anyCloseTag = any_close_tag
commonHTMLEntity = common_html_entity
replaceHTMLEntity = replace_html_entity

# operator-precedence helpers
opAssoc = OpAssoc
infixNotation = infix_notation

# comment and line expressions
cStyleComment = c_style_comment
htmlComment = html_comment
restOfLine = rest_of_line
dblSlashComment = dbl_slash_comment
cppStyleComment = cpp_style_comment
javaStyleComment = java_style_comment
pythonStyleComment = python_style_comment
|