rdf.py

  1. """
  2. pygments.lexers.rdf
  3. ~~~~~~~~~~~~~~~~~~~
  4. Lexers for semantic web and RDF query languages and markup.
  5. :copyright: Copyright 2006-2023 by the Pygments team, see AUTHORS.
  6. :license: BSD, see LICENSE for details.
  7. """
  8. import re
  9. from pygments.lexer import RegexLexer, bygroups, default
  10. from pygments.token import Keyword, Punctuation, String, Number, Operator, \
  11. Generic, Whitespace, Name, Literal, Comment, Text
  12. __all__ = ['SparqlLexer', 'TurtleLexer', 'ShExCLexer']
  13. class SparqlLexer(RegexLexer):
  14. """
  15. Lexer for `SPARQL <https://www.w3.org/TR/sparql11-query/>`_ query language.
  16. .. versionadded:: 2.0
  17. """
  18. name = 'SPARQL'
  19. aliases = ['sparql']
  20. filenames = ['*.rq', '*.sparql']
  21. mimetypes = ['application/sparql-query']
  22. # character group definitions ::
  23. PN_CHARS_BASE_GRP = ('a-zA-Z'
  24. '\u00c0-\u00d6'
  25. '\u00d8-\u00f6'
  26. '\u00f8-\u02ff'
  27. '\u0370-\u037d'
  28. '\u037f-\u1fff'
  29. '\u200c-\u200d'
  30. '\u2070-\u218f'
  31. '\u2c00-\u2fef'
  32. '\u3001-\ud7ff'
  33. '\uf900-\ufdcf'
  34. '\ufdf0-\ufffd')
  35. PN_CHARS_U_GRP = (PN_CHARS_BASE_GRP + '_')
  36. PN_CHARS_GRP = (PN_CHARS_U_GRP +
  37. r'\-' +
  38. r'0-9' +
  39. '\u00b7' +
  40. '\u0300-\u036f' +
  41. '\u203f-\u2040')
  42. HEX_GRP = '0-9A-Fa-f'
  43. PN_LOCAL_ESC_CHARS_GRP = r' _~.\-!$&"()*+,;=/?#@%'
  44. # terminal productions ::
  45. PN_CHARS_BASE = '[' + PN_CHARS_BASE_GRP + ']'
  46. PN_CHARS_U = '[' + PN_CHARS_U_GRP + ']'
  47. PN_CHARS = '[' + PN_CHARS_GRP + ']'
  48. HEX = '[' + HEX_GRP + ']'
  49. PN_LOCAL_ESC_CHARS = '[' + PN_LOCAL_ESC_CHARS_GRP + ']'
  50. IRIREF = r'<(?:[^<>"{}|^`\\\x00-\x20])*>'
  51. BLANK_NODE_LABEL = '_:[0-9' + PN_CHARS_U_GRP + '](?:[' + PN_CHARS_GRP + \
  52. '.]*' + PN_CHARS + ')?'
  53. PN_PREFIX = PN_CHARS_BASE + '(?:[' + PN_CHARS_GRP + '.]*' + PN_CHARS + ')?'
  54. VARNAME = '[0-9' + PN_CHARS_U_GRP + '][' + PN_CHARS_U_GRP + \
  55. '0-9\u00b7\u0300-\u036f\u203f-\u2040]*'
  56. PERCENT = '%' + HEX + HEX
  57. PN_LOCAL_ESC = r'\\' + PN_LOCAL_ESC_CHARS
  58. PLX = '(?:' + PERCENT + ')|(?:' + PN_LOCAL_ESC + ')'
  59. PN_LOCAL = ('(?:[' + PN_CHARS_U_GRP + ':0-9' + ']|' + PLX + ')' +
  60. '(?:(?:[' + PN_CHARS_GRP + '.:]|' + PLX + ')*(?:[' +
  61. PN_CHARS_GRP + ':]|' + PLX + '))?')
  62. EXPONENT = r'[eE][+-]?\d+'
  63. # Lexer token definitions ::
  64. tokens = {
  65. 'root': [
  66. (r'\s+', Text),
  67. # keywords ::
  68. (r'(?i)(select|construct|describe|ask|where|filter|group\s+by|minus|'
  69. r'distinct|reduced|from\s+named|from|order\s+by|desc|asc|limit|'
  70. r'offset|values|bindings|load|into|clear|drop|create|add|move|copy|'
  71. r'insert\s+data|delete\s+data|delete\s+where|with|delete|insert|'
  72. r'using\s+named|using|graph|default|named|all|optional|service|'
  73. r'silent|bind|undef|union|not\s+in|in|as|having|to|prefix|base)\b', Keyword),
  74. (r'(a)\b', Keyword),
  75. # IRIs ::
  76. ('(' + IRIREF + ')', Name.Label),
  77. # blank nodes ::
  78. ('(' + BLANK_NODE_LABEL + ')', Name.Label),
  79. # # variables ::
  80. ('[?$]' + VARNAME, Name.Variable),
  81. # prefixed names ::
  82. (r'(' + PN_PREFIX + r')?(\:)(' + PN_LOCAL + r')?',
  83. bygroups(Name.Namespace, Punctuation, Name.Tag)),
  84. # function names ::
  85. (r'(?i)(str|lang|langmatches|datatype|bound|iri|uri|bnode|rand|abs|'
  86. r'ceil|floor|round|concat|strlen|ucase|lcase|encode_for_uri|'
  87. r'contains|strstarts|strends|strbefore|strafter|year|month|day|'
  88. r'hours|minutes|seconds|timezone|tz|now|uuid|struuid|md5|sha1|sha256|sha384|'
  89. r'sha512|coalesce|if|strlang|strdt|sameterm|isiri|isuri|isblank|'
  90. r'isliteral|isnumeric|regex|substr|replace|exists|not\s+exists|'
  91. r'count|sum|min|max|avg|sample|group_concat|separator)\b',
  92. Name.Function),
  93. # boolean literals ::
  94. (r'(true|false)', Keyword.Constant),
  95. # double literals ::
  96. (r'[+\-]?(\d+\.\d*' + EXPONENT + r'|\.?\d+' + EXPONENT + ')', Number.Float),
  97. # decimal literals ::
  98. (r'[+\-]?(\d+\.\d*|\.\d+)', Number.Float),
  99. # integer literals ::
  100. (r'[+\-]?\d+', Number.Integer),
  101. # operators ::
  102. (r'(\|\||&&|=|\*|\-|\+|/|!=|<=|>=|!|<|>)', Operator),
  103. # punctuation characters ::
  104. (r'[(){}.;,:^\[\]]', Punctuation),
  105. # line comments ::
  106. (r'#[^\n]*', Comment),
  107. # strings ::
  108. (r'"""', String, 'triple-double-quoted-string'),
  109. (r'"', String, 'single-double-quoted-string'),
  110. (r"'''", String, 'triple-single-quoted-string'),
  111. (r"'", String, 'single-single-quoted-string'),
  112. ],
  113. 'triple-double-quoted-string': [
  114. (r'"""', String, 'end-of-string'),
  115. (r'[^\\]+', String),
  116. (r'\\', String, 'string-escape'),
  117. ],
  118. 'single-double-quoted-string': [
  119. (r'"', String, 'end-of-string'),
  120. (r'[^"\\\n]+', String),
  121. (r'\\', String, 'string-escape'),
  122. ],
  123. 'triple-single-quoted-string': [
  124. (r"'''", String, 'end-of-string'),
  125. (r'[^\\]+', String),
  126. (r'\\', String.Escape, 'string-escape'),
  127. ],
  128. 'single-single-quoted-string': [
  129. (r"'", String, 'end-of-string'),
  130. (r"[^'\\\n]+", String),
  131. (r'\\', String, 'string-escape'),
  132. ],
  133. 'string-escape': [
  134. (r'u' + HEX + '{4}', String.Escape, '#pop'),
  135. (r'U' + HEX + '{8}', String.Escape, '#pop'),
  136. (r'.', String.Escape, '#pop'),
  137. ],
  138. 'end-of-string': [
  139. (r'(@)([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)',
  140. bygroups(Operator, Name.Function), '#pop:2'),
  141. (r'\^\^', Operator, '#pop:2'),
  142. default('#pop:2'),
  143. ],
  144. }
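
# Minimal usage sketch (illustrative, not part of the upstream module). It
# assumes Pygments is installed and uses a throwaway example query:
#
#     from pygments import highlight
#     from pygments.formatters import TerminalFormatter
#
#     query = 'SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 10'
#     print(highlight(query, SparqlLexer(), TerminalFormatter()))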


class TurtleLexer(RegexLexer):
    """
    Lexer for `Turtle <http://www.w3.org/TR/turtle/>`_ data language.

    .. versionadded:: 2.1
    """
    name = 'Turtle'
    aliases = ['turtle']
    filenames = ['*.ttl']
    mimetypes = ['text/turtle', 'application/x-turtle']

    # character group definitions ::

    PN_CHARS_BASE_GRP = ('a-zA-Z'
                         '\u00c0-\u00d6'
                         '\u00d8-\u00f6'
                         '\u00f8-\u02ff'
                         '\u0370-\u037d'
                         '\u037f-\u1fff'
                         '\u200c-\u200d'
                         '\u2070-\u218f'
                         '\u2c00-\u2fef'
                         '\u3001-\ud7ff'
                         '\uf900-\ufdcf'
                         '\ufdf0-\ufffd')

    PN_CHARS_U_GRP = (PN_CHARS_BASE_GRP + '_')

    PN_CHARS_GRP = (PN_CHARS_U_GRP +
                    r'\-' +
                    r'0-9' +
                    '\u00b7' +
                    '\u0300-\u036f' +
                    '\u203f-\u2040')

    PN_CHARS = '[' + PN_CHARS_GRP + ']'

    PN_CHARS_BASE = '[' + PN_CHARS_BASE_GRP + ']'

    PN_PREFIX = PN_CHARS_BASE + '(?:[' + PN_CHARS_GRP + '.]*' + PN_CHARS + ')?'

    HEX_GRP = '0-9A-Fa-f'

    HEX = '[' + HEX_GRP + ']'

    PERCENT = '%' + HEX + HEX

    PN_LOCAL_ESC_CHARS_GRP = r' _~.\-!$&"()*+,;=/?#@%'

    PN_LOCAL_ESC_CHARS = '[' + PN_LOCAL_ESC_CHARS_GRP + ']'

    PN_LOCAL_ESC = r'\\' + PN_LOCAL_ESC_CHARS

    PLX = '(?:' + PERCENT + ')|(?:' + PN_LOCAL_ESC + ')'

    PN_LOCAL = ('(?:[' + PN_CHARS_U_GRP + ':0-9' + ']|' + PLX + ')' +
                '(?:(?:[' + PN_CHARS_GRP + '.:]|' + PLX + ')*(?:[' +
                PN_CHARS_GRP + ':]|' + PLX + '))?')

    patterns = {
        'PNAME_NS': r'((?:[a-zA-Z][\w-]*)?\:)',  # Simplified character range
        'IRIREF': r'(<[^<>"{}|^`\\\x00-\x20]*>)'
    }

    tokens = {
        'root': [
            (r'\s+', Text),

            # Base / prefix
            (r'(@base|BASE)(\s+)%(IRIREF)s(\s*)(\.?)' % patterns,
             bygroups(Keyword, Whitespace, Name.Variable, Whitespace,
                      Punctuation)),
            (r'(@prefix|PREFIX)(\s+)%(PNAME_NS)s(\s+)%(IRIREF)s(\s*)(\.?)' % patterns,
             bygroups(Keyword, Whitespace, Name.Namespace, Whitespace,
                      Name.Variable, Whitespace, Punctuation)),

            # The shorthand predicate 'a'
            (r'(?<=\s)a(?=\s)', Keyword.Type),

            # IRIREF
            (r'%(IRIREF)s' % patterns, Name.Variable),

            # PrefixedName
            (r'(' + PN_PREFIX + r')?(\:)(' + PN_LOCAL + r')?',
             bygroups(Name.Namespace, Punctuation, Name.Tag)),

            # Comment
            (r'#[^\n]+', Comment),

            (r'\b(true|false)\b', Literal),
            (r'[+\-]?\d*\.\d+', Number.Float),
            (r'[+\-]?\d*(?:\.\d+)?E[+\-]?\d+', Number.Float),
            (r'[+\-]?\d+', Number.Integer),
            (r'[\[\](){}.;,:^]', Punctuation),

            (r'"""', String, 'triple-double-quoted-string'),
            (r'"', String, 'single-double-quoted-string'),
            (r"'''", String, 'triple-single-quoted-string'),
            (r"'", String, 'single-single-quoted-string'),
        ],
        'triple-double-quoted-string': [
            (r'"""', String, 'end-of-string'),
            (r'[^\\]+', String),
            (r'\\', String, 'string-escape'),
        ],
        'single-double-quoted-string': [
            (r'"', String, 'end-of-string'),
            (r'[^"\\\n]+', String),
            (r'\\', String, 'string-escape'),
        ],
        'triple-single-quoted-string': [
            (r"'''", String, 'end-of-string'),
            (r'[^\\]+', String),
            (r'\\', String, 'string-escape'),
        ],
        'single-single-quoted-string': [
            (r"'", String, 'end-of-string'),
            (r"[^'\\\n]+", String),
            (r'\\', String, 'string-escape'),
        ],
        'string-escape': [
            (r'.', String, '#pop'),
        ],
        'end-of-string': [
            (r'(@)([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)',
             bygroups(Operator, Generic.Emph), '#pop:2'),

            (r'(\^\^)%(IRIREF)s' % patterns, bygroups(Operator, Generic.Emph), '#pop:2'),

            default('#pop:2'),
        ],
    }

    # Turtle and Tera Term macro files share the same file extension
    # but each has a recognizable and distinct syntax.
    def analyse_text(text):
        for t in ('@base ', 'BASE ', '@prefix ', 'PREFIX '):
            if re.search(r'^\s*%s' % t, text):
                return 0.80
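
# Usage sketch (illustrative, not part of the upstream module): because '*.ttl'
# is registered in `filenames` and analyse_text() rewards @base/@prefix
# directives, Pygments' lexer guessing will normally resolve Turtle input to
# this lexer rather than the Tera Term lexer:
#
#     from pygments.lexers import guess_lexer
#
#     doc = '@prefix ex: <http://example.org/> .\nex:a ex:b ex:c .'
#     print(guess_lexer(doc).name)  # typically 'Turtle'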


class ShExCLexer(RegexLexer):
    """
    Lexer for `ShExC <https://shex.io/shex-semantics/#shexc>`_ shape expressions language syntax.
    """
    name = 'ShExC'
    aliases = ['shexc', 'shex']
    filenames = ['*.shex']
    mimetypes = ['text/shex']

    # character group definitions ::

    PN_CHARS_BASE_GRP = ('a-zA-Z'
                         '\u00c0-\u00d6'
                         '\u00d8-\u00f6'
                         '\u00f8-\u02ff'
                         '\u0370-\u037d'
                         '\u037f-\u1fff'
                         '\u200c-\u200d'
                         '\u2070-\u218f'
                         '\u2c00-\u2fef'
                         '\u3001-\ud7ff'
                         '\uf900-\ufdcf'
                         '\ufdf0-\ufffd')

    PN_CHARS_U_GRP = (PN_CHARS_BASE_GRP + '_')

    PN_CHARS_GRP = (PN_CHARS_U_GRP +
                    r'\-' +
                    r'0-9' +
                    '\u00b7' +
                    '\u0300-\u036f' +
                    '\u203f-\u2040')

    HEX_GRP = '0-9A-Fa-f'

    PN_LOCAL_ESC_CHARS_GRP = r"_~.\-!$&'()*+,;=/?#@%"

    # terminal productions ::

    PN_CHARS_BASE = '[' + PN_CHARS_BASE_GRP + ']'

    PN_CHARS_U = '[' + PN_CHARS_U_GRP + ']'

    PN_CHARS = '[' + PN_CHARS_GRP + ']'

    HEX = '[' + HEX_GRP + ']'

    PN_LOCAL_ESC_CHARS = '[' + PN_LOCAL_ESC_CHARS_GRP + ']'

    UCHAR_NO_BACKSLASH = '(?:u' + HEX + '{4}|U' + HEX + '{8})'

    UCHAR = r'\\' + UCHAR_NO_BACKSLASH

    IRIREF = r'<(?:[^\x00-\x20<>"{}|^`\\]|' + UCHAR + ')*>'

    BLANK_NODE_LABEL = '_:[0-9' + PN_CHARS_U_GRP + '](?:[' + PN_CHARS_GRP + \
                       '.]*' + PN_CHARS + ')?'

    PN_PREFIX = PN_CHARS_BASE + '(?:[' + PN_CHARS_GRP + '.]*' + PN_CHARS + ')?'

    PERCENT = '%' + HEX + HEX

    PN_LOCAL_ESC = r'\\' + PN_LOCAL_ESC_CHARS

    PLX = '(?:' + PERCENT + ')|(?:' + PN_LOCAL_ESC + ')'

    PN_LOCAL = ('(?:[' + PN_CHARS_U_GRP + ':0-9' + ']|' + PLX + ')' +
                '(?:(?:[' + PN_CHARS_GRP + '.:]|' + PLX + ')*(?:[' +
                PN_CHARS_GRP + ':]|' + PLX + '))?')

    EXPONENT = r'[eE][+-]?\d+'

    # Lexer token definitions ::

    tokens = {
        'root': [
            (r'\s+', Text),
            # keywords ::
            (r'(?i)(base|prefix|start|external|'
             r'literal|iri|bnode|nonliteral|length|minlength|maxlength|'
             r'mininclusive|minexclusive|maxinclusive|maxexclusive|'
             r'totaldigits|fractiondigits|'
             r'closed|extra)\b', Keyword),
            (r'(a)\b', Keyword),
            # IRIs ::
            ('(' + IRIREF + ')', Name.Label),
            # blank nodes ::
            ('(' + BLANK_NODE_LABEL + ')', Name.Label),
            # prefixed names ::
            (r'(' + PN_PREFIX + r')?(\:)(' + PN_LOCAL + ')?',
             bygroups(Name.Namespace, Punctuation, Name.Tag)),
            # boolean literals ::
            (r'(true|false)', Keyword.Constant),
            # double literals ::
            (r'[+\-]?(\d+\.\d*' + EXPONENT + r'|\.?\d+' + EXPONENT + ')', Number.Float),
            # decimal literals ::
            (r'[+\-]?(\d+\.\d*|\.\d+)', Number.Float),
            # integer literals ::
            (r'[+\-]?\d+', Number.Integer),
            # operators ::
            (r'[@|$&=*+?^\-~]', Operator),
            # operator keywords ::
            (r'(?i)(and|or|not)\b', Operator.Word),
            # punctuation characters ::
            (r'[(){}.;,:^\[\]]', Punctuation),
            # line comments ::
            (r'#[^\n]*', Comment),
            # strings ::
            (r'"""', String, 'triple-double-quoted-string'),
            (r'"', String, 'single-double-quoted-string'),
            (r"'''", String, 'triple-single-quoted-string'),
            (r"'", String, 'single-single-quoted-string'),
        ],
        'triple-double-quoted-string': [
            (r'"""', String, 'end-of-string'),
            (r'[^\\]+', String),
            (r'\\', String, 'string-escape'),
        ],
        'single-double-quoted-string': [
            (r'"', String, 'end-of-string'),
            (r'[^"\\\n]+', String),
            (r'\\', String, 'string-escape'),
        ],
        'triple-single-quoted-string': [
            (r"'''", String, 'end-of-string'),
            (r'[^\\]+', String),
            (r'\\', String.Escape, 'string-escape'),
        ],
        'single-single-quoted-string': [
            (r"'", String, 'end-of-string'),
            (r"[^'\\\n]+", String),
            (r'\\', String, 'string-escape'),
        ],
        'string-escape': [
            (UCHAR_NO_BACKSLASH, String.Escape, '#pop'),
            (r'.', String.Escape, '#pop'),
        ],
        'end-of-string': [
            (r'(@)([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)',
             bygroups(Operator, Name.Function), '#pop:2'),
            (r'\^\^', Operator, '#pop:2'),
            default('#pop:2'),
        ],
    }
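

# Quick demonstration hook (added for illustration; not part of the upstream
# Pygments module). Running this file directly highlights a small, made-up
# ShExC schema with the standard HtmlFormatter.
if __name__ == '__main__':
    from pygments import highlight
    from pygments.formatters import HtmlFormatter

    SHEX_EXAMPLE = '''\
PREFIX ex: <http://example.org/>
ex:UserShape {
  ex:name ex:string ;
  ex:age ex:integer?
}
'''
    print(highlight(SHEX_EXAMPLE, ShExCLexer(), HtmlFormatter()))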