12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091 |
- """
- pygments.regexopt
- ~~~~~~~~~~~~~~~~~
- An algorithm that generates optimized regexes for matching long lists of
- literal strings.
- :copyright: Copyright 2006-2023 by the Pygments team, see AUTHORS.
- :license: BSD, see LICENSE for details.
- """
- import re
- from re import escape
- from os.path import commonprefix
- from itertools import groupby
- from operator import itemgetter
- CS_ESCAPE = re.compile(r'[\[\^\\\-\]]')
- FIRST_ELEMENT = itemgetter(0)
- def make_charset(letters):
- return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']'
- def regex_opt_inner(strings, open_paren):
- """Return a regex that matches any string in the sorted list of strings."""
- close_paren = open_paren and ')' or ''
- # print strings, repr(open_paren)
- if not strings:
- # print '-> nothing left'
- return ''
- first = strings[0]
- if len(strings) == 1:
- # print '-> only 1 string'
- return open_paren + escape(first) + close_paren
- if not first:
- # print '-> first string empty'
- return open_paren + regex_opt_inner(strings[1:], '(?:') \
- + '?' + close_paren
- if len(first) == 1:
- # multiple one-char strings? make a charset
- oneletter = []
- rest = []
- for s in strings:
- if len(s) == 1:
- oneletter.append(s)
- else:
- rest.append(s)
- if len(oneletter) > 1: # do we have more than one oneletter string?
- if rest:
- # print '-> 1-character + rest'
- return open_paren + regex_opt_inner(rest, '') + '|' \
- + make_charset(oneletter) + close_paren
- # print '-> only 1-character'
- return open_paren + make_charset(oneletter) + close_paren
- prefix = commonprefix(strings)
- if prefix:
- plen = len(prefix)
- # we have a prefix for all strings
- # print '-> prefix:', prefix
- return open_paren + escape(prefix) \
- + regex_opt_inner([s[plen:] for s in strings], '(?:') \
- + close_paren
- # is there a suffix?
- strings_rev = [s[::-1] for s in strings]
- suffix = commonprefix(strings_rev)
- if suffix:
- slen = len(suffix)
- # print '-> suffix:', suffix[::-1]
- return open_paren \
- + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \
- + escape(suffix[::-1]) + close_paren
- # recurse on common 1-string prefixes
- # print '-> last resort'
- return open_paren + \
- '|'.join(regex_opt_inner(list(group[1]), '')
- for group in groupby(strings, lambda s: s[0] == first[0])) \
- + close_paren
- def regex_opt(strings, prefix='', suffix=''):
- """Return a compiled regex that matches any string in the given list.
- The strings to match must be literal strings, not regexes. They will be
- regex-escaped.
- *prefix* and *suffix* are pre- and appended to the final regex.
- """
- strings = sorted(strings)
- return prefix + regex_opt_inner(strings, '(') + suffix
|