urls.py 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  1. from __future__ import annotations
  2. import codecs
  3. import re
  4. import typing as t
  5. from urllib.parse import quote
  6. from urllib.parse import unquote
  7. from urllib.parse import urlencode
  8. from urllib.parse import urlsplit
  9. from urllib.parse import urlunsplit
  10. from .datastructures import iter_multi_items
  11. def _codec_error_url_quote(e: UnicodeError) -> tuple[str, int]:
  12. """Used in :func:`uri_to_iri` after unquoting to re-quote any
  13. invalid bytes.
  14. """
  15. # the docs state that UnicodeError does have these attributes,
  16. # but mypy isn't picking them up
  17. out = quote(e.object[e.start : e.end], safe="") # type: ignore
  18. return out, e.end # type: ignore
  19. codecs.register_error("werkzeug.url_quote", _codec_error_url_quote)
  20. def _make_unquote_part(name: str, chars: str) -> t.Callable[[str], str]:
  21. """Create a function that unquotes all percent encoded characters except those
  22. given. This allows working with unquoted characters if possible while not changing
  23. the meaning of a given part of a URL.
  24. """
  25. choices = "|".join(f"{ord(c):02X}" for c in sorted(chars))
  26. pattern = re.compile(f"((?:%(?:{choices}))+)", re.I)
  27. def _unquote_partial(value: str) -> str:
  28. parts = iter(pattern.split(value))
  29. out = []
  30. for part in parts:
  31. out.append(unquote(part, "utf-8", "werkzeug.url_quote"))
  32. out.append(next(parts, ""))
  33. return "".join(out)
  34. _unquote_partial.__name__ = f"_unquote_{name}"
  35. return _unquote_partial
  36. # characters that should remain quoted in URL parts
  37. # based on https://url.spec.whatwg.org/#percent-encoded-bytes
  38. # always keep all controls, space, and % quoted
  39. _always_unsafe = bytes((*range(0x21), 0x25, 0x7F)).decode()
  40. _unquote_fragment = _make_unquote_part("fragment", _always_unsafe)
  41. _unquote_query = _make_unquote_part("query", _always_unsafe + "&=+#")
  42. _unquote_path = _make_unquote_part("path", _always_unsafe + "/?#")
  43. _unquote_user = _make_unquote_part("user", _always_unsafe + ":@/?#")
  44. def uri_to_iri(uri: str) -> str:
  45. """Convert a URI to an IRI. All valid UTF-8 characters are unquoted,
  46. leaving all reserved and invalid characters quoted. If the URL has
  47. a domain, it is decoded from Punycode.
  48. >>> uri_to_iri("http://xn--n3h.net/p%C3%A5th?q=%C3%A8ry%DF")
  49. 'http://\\u2603.net/p\\xe5th?q=\\xe8ry%DF'
  50. :param uri: The URI to convert.
  51. .. versionchanged:: 3.0
  52. Passing a tuple or bytes, and the ``charset`` and ``errors`` parameters,
  53. are removed.
  54. .. versionchanged:: 2.3
  55. Which characters remain quoted is specific to each part of the URL.
  56. .. versionchanged:: 0.15
  57. All reserved and invalid characters remain quoted. Previously,
  58. only some reserved characters were preserved, and invalid bytes
  59. were replaced instead of left quoted.
  60. .. versionadded:: 0.6
  61. """
  62. parts = urlsplit(uri)
  63. path = _unquote_path(parts.path)
  64. query = _unquote_query(parts.query)
  65. fragment = _unquote_fragment(parts.fragment)
  66. if parts.hostname:
  67. netloc = _decode_idna(parts.hostname)
  68. else:
  69. netloc = ""
  70. if ":" in netloc:
  71. netloc = f"[{netloc}]"
  72. if parts.port:
  73. netloc = f"{netloc}:{parts.port}"
  74. if parts.username:
  75. auth = _unquote_user(parts.username)
  76. if parts.password:
  77. password = _unquote_user(parts.password)
  78. auth = f"{auth}:{password}"
  79. netloc = f"{auth}@{netloc}"
  80. return urlunsplit((parts.scheme, netloc, path, query, fragment))
  81. def iri_to_uri(iri: str) -> str:
  82. """Convert an IRI to a URI. All non-ASCII and unsafe characters are
  83. quoted. If the URL has a domain, it is encoded to Punycode.
  84. >>> iri_to_uri('http://\\u2603.net/p\\xe5th?q=\\xe8ry%DF')
  85. 'http://xn--n3h.net/p%C3%A5th?q=%C3%A8ry%DF'
  86. :param iri: The IRI to convert.
  87. .. versionchanged:: 3.0
  88. Passing a tuple or bytes, the ``charset`` and ``errors`` parameters,
  89. and the ``safe_conversion`` parameter, are removed.
  90. .. versionchanged:: 2.3
  91. Which characters remain unquoted is specific to each part of the URL.
  92. .. versionchanged:: 0.15
  93. All reserved characters remain unquoted. Previously, only some reserved
  94. characters were left unquoted.
  95. .. versionchanged:: 0.9.6
  96. The ``safe_conversion`` parameter was added.
  97. .. versionadded:: 0.6
  98. """
  99. parts = urlsplit(iri)
  100. # safe = https://url.spec.whatwg.org/#url-path-segment-string
  101. # as well as percent for things that are already quoted
  102. path = quote(parts.path, safe="%!$&'()*+,/:;=@")
  103. query = quote(parts.query, safe="%!$&'()*+,/:;=?@")
  104. fragment = quote(parts.fragment, safe="%!#$&'()*+,/:;=?@")
  105. if parts.hostname:
  106. netloc = parts.hostname.encode("idna").decode("ascii")
  107. else:
  108. netloc = ""
  109. if ":" in netloc:
  110. netloc = f"[{netloc}]"
  111. if parts.port:
  112. netloc = f"{netloc}:{parts.port}"
  113. if parts.username:
  114. auth = quote(parts.username, safe="%!$&'()*+,;=")
  115. if parts.password:
  116. password = quote(parts.password, safe="%!$&'()*+,;=")
  117. auth = f"{auth}:{password}"
  118. netloc = f"{auth}@{netloc}"
  119. return urlunsplit((parts.scheme, netloc, path, query, fragment))
  120. def _invalid_iri_to_uri(iri: str) -> str:
  121. """The URL scheme ``itms-services://`` must contain the ``//`` even though it does
  122. not have a host component. There may be other invalid schemes as well. Currently,
  123. responses will always call ``iri_to_uri`` on the redirect ``Location`` header, which
  124. removes the ``//``. For now, if the IRI only contains ASCII and does not contain
  125. spaces, pass it on as-is. In Werkzeug 3.0, this should become a
  126. ``response.process_location`` flag.
  127. :meta private:
  128. """
  129. try:
  130. iri.encode("ascii")
  131. except UnicodeError:
  132. pass
  133. else:
  134. if len(iri.split(None, 1)) == 1:
  135. return iri
  136. return iri_to_uri(iri)
  137. def _decode_idna(domain: str) -> str:
  138. try:
  139. data = domain.encode("ascii")
  140. except UnicodeEncodeError:
  141. # If the domain is not ASCII, it's decoded already.
  142. return domain
  143. try:
  144. # Try decoding in one shot.
  145. return data.decode("idna")
  146. except UnicodeDecodeError:
  147. pass
  148. # Decode each part separately, leaving invalid parts as punycode.
  149. parts = []
  150. for part in data.split(b"."):
  151. try:
  152. parts.append(part.decode("idna"))
  153. except UnicodeDecodeError:
  154. parts.append(part.decode("ascii"))
  155. return ".".join(parts)
  156. def _urlencode(query: t.Mapping[str, str] | t.Iterable[tuple[str, str]]) -> str:
  157. items = [x for x in iter_multi_items(query) if x[1] is not None]
  158. # safe = https://url.spec.whatwg.org/#percent-encoded-bytes
  159. return urlencode(items, safe="!$'()*,/:;?@")