  1. """
  2. pygments.lexers.textfmts
  3. ~~~~~~~~~~~~~~~~~~~~~~~~
  4. Lexers for various text formats.
  5. :copyright: Copyright 2006-2023 by the Pygments team, see AUTHORS.
  6. :license: BSD, see LICENSE for details.
  7. """
  8. import re
  9. from pygments.lexers import guess_lexer, get_lexer_by_name
  10. from pygments.lexer import RegexLexer, bygroups, default, include
  11. from pygments.token import Text, Comment, Operator, Keyword, Name, String, \
  12. Number, Generic, Literal, Punctuation
  13. from pygments.util import ClassNotFound
  14. __all__ = ['IrcLogsLexer', 'TodotxtLexer', 'HttpLexer', 'GettextLexer',
  15. 'NotmuchLexer', 'KernelLogLexer']


class IrcLogsLexer(RegexLexer):
    """
    Lexer for IRC logs in *irssi*, *xchat* or *weechat* style.
    """

    name = 'IRC logs'
    aliases = ['irc']
    filenames = ['*.weechatlog']
    mimetypes = ['text/x-irclog']

    flags = re.VERBOSE | re.MULTILINE
    timestamp = r"""
        (
          # irssi / xchat and others
          (?: \[|\()?                  # Opening bracket or paren for the timestamp
            (?:                        # Timestamp
                (?: (?:\d{1,4} [-/])*  # Date as - or /-separated groups of digits
                    (?:\d{1,4})
                 [T ])?                # Date/time separator: T or space
                (?: \d?\d [:.])*       # Time as :/.-separated groups of 1 or 2 digits
                    (?: \d?\d)
            )
          (?: \]|\))?\s+               # Closing bracket or paren for the timestamp
        |
          # weechat
          \d{4}\s\w{3}\s\d{2}\s        # Date
          \d{2}:\d{2}:\d{2}\s+         # Time + Whitespace
        |
          # xchat
          \w{3}\s\d{2}\s               # Date
          \d{2}:\d{2}:\d{2}\s+         # Time + Whitespace
        )?
    """
    tokens = {
        'root': [
            # log start/end
            (r'^\*\*\*\*(.*)\*\*\*\*$', Comment),
            # hack
            ("^" + timestamp + r'(\s*<[^>]*>\s*)$',
             bygroups(Comment.Preproc, Name.Tag)),
            # normal msgs
            ("^" + timestamp + r"""
                (\s*<.*?>\s*)          # Nick """,
             bygroups(Comment.Preproc, Name.Tag), 'msg'),
            # /me msgs
            ("^" + timestamp + r"""
                (\s*[*]\s+)            # Star
                (\S+\s+.*?\n)          # Nick + rest of message """,
             bygroups(Comment.Preproc, Keyword, Generic.Inserted)),
            # join/part msgs
            ("^" + timestamp + r"""
                (\s*(?:\*{3}|<?-[!@=P]?->?)\s*)  # Star(s) or symbols
                (\S+\s+)                         # Nick + Space
                (.*?\n)                          # Rest of message """,
             bygroups(Comment.Preproc, Keyword, String, Comment)),
            (r"^.*?\n", Text),
        ],
        'msg': [
            (r"\S+:(?!//)", Name.Attribute),  # Prefix
            (r".*\n", Text, '#pop'),
        ],
    }
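
# A quick usage sketch (illustrative comment, not part of the module): any
# standard Pygments front end can drive this lexer, e.g. on an irssi-style
# line.
#
#     from pygments import highlight
#     from pygments.formatters import TerminalFormatter
#     log = '12:34 <alice> hello everyone\n'
#     print(highlight(log, IrcLogsLexer(), TerminalFormatter()))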


class GettextLexer(RegexLexer):
    """
    Lexer for Gettext catalog files.

    .. versionadded:: 0.9
    """
    name = 'Gettext Catalog'
    aliases = ['pot', 'po']
    filenames = ['*.pot', '*.po']
    mimetypes = ['application/x-gettext', 'text/x-gettext', 'text/gettext']

    tokens = {
        'root': [
            (r'^#,\s.*?$', Keyword.Type),
            (r'^#:\s.*?$', Keyword.Declaration),
            # (r'^#$', Comment),
            (r'^(#|#\.\s|#\|\s|#~\s|#\s).*$', Comment.Single),
            (r'^(")([A-Za-z-]+:)(.*")$',
             bygroups(String, Name.Property, String)),
            (r'^".*"$', String),
            (r'^(msgid|msgid_plural|msgstr|msgctxt)(\s+)(".*")$',
             bygroups(Name.Variable, Text, String)),
            (r'^(msgstr\[)(\d)(\])(\s+)(".*")$',
             bygroups(Name.Variable, Number.Integer, Name.Variable, Text, String)),
        ]
    }
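
# A quick sketch (illustrative comment, not part of the module): the token
# stream for a PO snippet can be inspected directly with get_tokens().
#
#     po = 'msgid "Hello"\nmsgstr "Bonjour"\n'
#     for token, value in GettextLexer().get_tokens(po):
#         print(token, repr(value))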


class HttpLexer(RegexLexer):
    """
    Lexer for HTTP sessions.

    .. versionadded:: 1.5
    """

    name = 'HTTP'
    aliases = ['http']

    flags = re.DOTALL

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """Reset the content-type state."""
        self.content_type = None
        return RegexLexer.get_tokens_unprocessed(self, text, stack)

    def header_callback(self, match):
        if match.group(1).lower() == 'content-type':
            content_type = match.group(5).strip()
            if ';' in content_type:
                content_type = content_type[:content_type.find(';')].strip()
            self.content_type = content_type
        yield match.start(1), Name.Attribute, match.group(1)
        yield match.start(2), Text, match.group(2)
        yield match.start(3), Operator, match.group(3)
        yield match.start(4), Text, match.group(4)
        yield match.start(5), Literal, match.group(5)
        yield match.start(6), Text, match.group(6)

    def continuous_header_callback(self, match):
        yield match.start(1), Text, match.group(1)
        yield match.start(2), Literal, match.group(2)
        yield match.start(3), Text, match.group(3)

    def content_callback(self, match):
        content_type = getattr(self, 'content_type', None)
        content = match.group()
        offset = match.start()
        if content_type:
            from pygments.lexers import get_lexer_for_mimetype
            possible_lexer_mimetypes = [content_type]
            if '+' in content_type:
                # application/calendar+xml can be treated as application/xml
                # if there's not a better match.
                general_type = re.sub(r'^(.*)/.*\+(.*)$', r'\1/\2',
                                      content_type)
                possible_lexer_mimetypes.append(general_type)

            for i in possible_lexer_mimetypes:
                try:
                    lexer = get_lexer_for_mimetype(i)
                except ClassNotFound:
                    pass
                else:
                    for idx, token, value in lexer.get_tokens_unprocessed(content):
                        yield offset + idx, token, value
                    return

        yield offset, Text, content

    tokens = {
        'root': [
            (r'([a-zA-Z][-_a-zA-Z]+)( +)([^ ]+)( +)'
             r'(HTTP)(/)(1\.[01]|2(?:\.0)?|3)(\r?\n|\Z)',
             bygroups(Name.Function, Text, Name.Namespace, Text,
                      Keyword.Reserved, Operator, Number, Text),
             'headers'),
            (r'(HTTP)(/)(1\.[01]|2(?:\.0)?|3)( +)(\d{3})(?:( +)([^\r\n]*))?(\r?\n|\Z)',
             bygroups(Keyword.Reserved, Operator, Number, Text, Number, Text,
                      Name.Exception, Text),
             'headers'),
        ],
        'headers': [
            (r'([^\s:]+)( *)(:)( *)([^\r\n]*)(\r?\n|\Z)', header_callback),
            (r'([\t ]+)([^\r\n]+)(\r?\n|\Z)', continuous_header_callback),
            (r'\r?\n', Text, 'content')
        ],
        'content': [
            (r'.+', content_callback)
        ]
    }

    def analyse_text(text):
        return any(
            re.search(pattern, text) is not None
            for pattern in (
                r'^([a-zA-Z][-_a-zA-Z]+)( +)([^ ]+)( +)(HTTP)(/)(1\.[01]|2(?:\.0)?|3)(\r?\n|\Z)',
                r'^(HTTP)(/)(1\.[01]|2(?:\.0)?|3)( +)(\d{3})(?:( +)([^\r\n]*))?(\r?\n|\Z)',
            )
        )
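
# A quick sketch (illustrative comment, not part of the module) of the
# Content-Type handoff implemented in content_callback() above: the JSON body
# here is re-lexed by the lexer registered for application/json instead of
# being emitted as plain Text.
#
#     session = ('HTTP/1.1 200 OK\r\n'
#                'Content-Type: application/json\r\n'
#                '\r\n'
#                '{"ok": true}')
#     for token, value in HttpLexer().get_tokens(session):
#         print(token, repr(value))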


class TodotxtLexer(RegexLexer):
    """
    Lexer for Todo.txt todo list format.

    .. versionadded:: 2.0
    """

    name = 'Todotxt'
    url = 'http://todotxt.com/'
    aliases = ['todotxt']
    # *.todotxt is not a standard extension for Todo.txt files; including it
    # makes testing easier, and also makes autodetecting file type easier.
    filenames = ['todo.txt', '*.todotxt']
    mimetypes = ['text/x-todo']

    # Aliases mapping standard token types of Todo.txt format concepts
    CompleteTaskText = Operator  # Chosen to de-emphasize complete tasks
    IncompleteTaskText = Text    # Incomplete tasks should look like plain text

    # Priority should have most emphasis to indicate importance of tasks
    Priority = Generic.Heading
    # Dates should have next most emphasis because time is important
    Date = Generic.Subheading

    # Project and context should have equal weight, and be in different colors
    Project = Generic.Error
    Context = String

    # If tag functionality is added, it should have the same weight as Project
    # and Context, and a different color. Generic.Traceback would work well.

    # Regex patterns for building up rules; dates, priorities, projects, and
    # contexts are all atomic
    # TODO: Make date regex more ISO 8601 compliant
    date_regex = r'\d{4,}-\d{2}-\d{2}'
    priority_regex = r'\([A-Z]\)'
    project_regex = r'\+\S+'
    context_regex = r'@\S+'

    # Compound regex expressions
    complete_one_date_regex = r'(x )(' + date_regex + r')'
    complete_two_date_regex = (complete_one_date_regex + r'( )(' +
                               date_regex + r')')
    priority_date_regex = r'(' + priority_regex + r')( )(' + date_regex + r')'

    tokens = {
        # Should parse starting at beginning of line; each line is a task
        'root': [
            # Complete task entry points: two total:
            # 1. Complete task with two dates
            (complete_two_date_regex, bygroups(CompleteTaskText, Date,
                                               CompleteTaskText, Date),
             'complete'),
            # 2. Complete task with one date
            (complete_one_date_regex, bygroups(CompleteTaskText, Date),
             'complete'),

            # Incomplete task entry points: six total:
            # 1. Priority plus date
            (priority_date_regex, bygroups(Priority, IncompleteTaskText, Date),
             'incomplete'),
            # 2. Priority only
            (priority_regex, Priority, 'incomplete'),
            # 3. Leading date
            (date_regex, Date, 'incomplete'),
            # 4. Leading context
            (context_regex, Context, 'incomplete'),
            # 5. Leading project
            (project_regex, Project, 'incomplete'),
            # 6. Non-whitespace catch-all
            (r'\S+', IncompleteTaskText, 'incomplete'),
        ],

        # Parse a complete task
        'complete': [
            # Newline indicates end of task, should return to root
            (r'\s*\n', CompleteTaskText, '#pop'),
            # Tokenize contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Tokenize non-whitespace text
            (r'\S+', CompleteTaskText),
            # Tokenize whitespace not containing a newline
            (r'\s+', CompleteTaskText),
        ],

        # Parse an incomplete task
        'incomplete': [
            # Newline indicates end of task, should return to root
            (r'\s*\n', IncompleteTaskText, '#pop'),
            # Tokenize contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Tokenize non-whitespace text
            (r'\S+', IncompleteTaskText),
            # Tokenize whitespace not containing a newline
            (r'\s+', IncompleteTaskText),
        ],
    }
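
# A quick sketch (illustrative comment, not part of the module): a prioritized
# task with a date, a context, and a project exercises the 'root' and
# 'incomplete' states above.
#
#     from pygments import highlight
#     from pygments.formatters import TerminalFormatter
#     task = '(A) 2023-01-01 Call mom @phone +family\n'
#     print(highlight(task, TodotxtLexer(), TerminalFormatter()))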


class NotmuchLexer(RegexLexer):
    """
    For Notmuch email text format.

    .. versionadded:: 2.5

    Additional options accepted:

    `body_lexer`
        If given, highlight the contents of the message body with the specified
        lexer, else guess it according to the body content (default: ``None``).
    """

    name = 'Notmuch'
    url = 'https://notmuchmail.org/'
    aliases = ['notmuch']

    def _highlight_code(self, match):
        code = match.group(1)

        try:
            if self.body_lexer:
                lexer = get_lexer_by_name(self.body_lexer)
            else:
                lexer = guess_lexer(code.strip())
        except ClassNotFound:
            lexer = get_lexer_by_name('text')

        yield from lexer.get_tokens_unprocessed(code)

    tokens = {
        'root': [
            (r'\fmessage\{\s*', Keyword, ('message', 'message-attr')),
        ],
        'message-attr': [
            (r'(\s*id:\s*)(\S+)', bygroups(Name.Attribute, String)),
            (r'(\s*(?:depth|match|excluded):\s*)(\d+)',
             bygroups(Name.Attribute, Number.Integer)),
            (r'(\s*filename:\s*)(.+\n)',
             bygroups(Name.Attribute, String)),
            default('#pop'),
        ],
        'message': [
            (r'\fmessage\}\n', Keyword, '#pop'),
            (r'\fheader\{\n', Keyword, 'header'),
            (r'\fbody\{\n', Keyword, 'body'),
        ],
        'header': [
            (r'\fheader\}\n', Keyword, '#pop'),
            (r'((?:Subject|From|To|Cc|Date):\s*)(.*\n)',
             bygroups(Name.Attribute, String)),
            (r'(.*)(\s*\(.*\))(\s*\(.*\)\n)',
             bygroups(Generic.Strong, Literal, Name.Tag)),
        ],
        'body': [
            (r'\fpart\{\n', Keyword, 'part'),
            (r'\f(part|attachment)\{\s*', Keyword, ('part', 'part-attr')),
            (r'\fbody\}\n', Keyword, '#pop'),
        ],
        'part-attr': [
            (r'(ID:\s*)(\d+)', bygroups(Name.Attribute, Number.Integer)),
            (r'(,\s*)((?:Filename|Content-id):\s*)([^,]+)',
             bygroups(Punctuation, Name.Attribute, String)),
            (r'(,\s*)(Content-type:\s*)(.+\n)',
             bygroups(Punctuation, Name.Attribute, String)),
            default('#pop'),
        ],
        'part': [
            (r'\f(?:part|attachment)\}\n', Keyword, '#pop'),
            (r'\f(?:part|attachment)\{\s*', Keyword, ('#push', 'part-attr')),
            (r'^Non-text part: .*\n', Comment),
            (r'(?s)(.*?(?=\f(?:part|attachment)\}\n))', _highlight_code),
        ],
    }

    def analyse_text(text):
        return 1.0 if text.startswith('\fmessage{') else 0.0

    def __init__(self, **options):
        self.body_lexer = options.get('body_lexer', None)
        RegexLexer.__init__(self, **options)
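
# A quick sketch (illustrative comment, not part of the module): the
# `body_lexer` option documented above is passed as a keyword argument;
# `notmuch_output` below is a placeholder for text captured from `notmuch show`.
#
#     lexer = NotmuchLexer(body_lexer='diff')  # any registered lexer alias
#     for token, value in lexer.get_tokens(notmuch_output):
#         print(token, repr(value))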


class KernelLogLexer(RegexLexer):
    """
    For Linux Kernel log ("dmesg") output.

    .. versionadded:: 2.6
    """

    name = 'Kernel log'
    aliases = ['kmsg', 'dmesg']
    filenames = ['*.kmsg', '*.dmesg']

    tokens = {
        'root': [
            (r'^[^:]+:debug : (?=\[)', Text, 'debug'),
            (r'^[^:]+:info  : (?=\[)', Text, 'info'),
            (r'^[^:]+:warn  : (?=\[)', Text, 'warn'),
            (r'^[^:]+:notice: (?=\[)', Text, 'warn'),
            (r'^[^:]+:err   : (?=\[)', Text, 'error'),
            (r'^[^:]+:crit  : (?=\[)', Text, 'error'),
            (r'^(?=\[)', Text, 'unknown'),
        ],
        'unknown': [
            (r'^(?=.+(warning|notice|audit|deprecated))', Text, 'warn'),
            (r'^(?=.+(error|critical|fail|Bug))', Text, 'error'),
            default('info'),
        ],
        'base': [
            (r'\[[0-9. ]+\] ', Number),
            (r'(?<=\] ).+?:', Keyword),
            (r'\n', Text, '#pop'),
        ],
        'debug': [
            include('base'),
            (r'.+\n', Comment, '#pop')
        ],
        'info': [
            include('base'),
            (r'.+\n', Text, '#pop')
        ],
        'warn': [
            include('base'),
            (r'.+\n', Generic.Strong, '#pop')
        ],
        'error': [
            include('base'),
            (r'.+\n', Generic.Error, '#pop')
        ]
    }
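
# A quick sketch (illustrative comment, not part of the module): a bare dmesg
# line with no syslog prefix takes the 'unknown' -> 'info' path above.
#
#     from pygments import highlight
#     from pygments.formatters import TerminalFormatter
#     line = '[    0.000000] Booting Linux on physical CPU 0x0\n'
#     print(highlight(line, KernelLogLexer(), TerminalFormatter()))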