multipart.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321
  1. from __future__ import annotations
  2. import re
  3. import typing as t
  4. from dataclasses import dataclass
  5. from enum import auto
  6. from enum import Enum
  7. from ..datastructures import Headers
  8. from ..exceptions import RequestEntityTooLarge
  9. from ..http import parse_options_header
  10. class Event:
  11. pass
  12. @dataclass(frozen=True)
  13. class Preamble(Event):
  14. data: bytes
  15. @dataclass(frozen=True)
  16. class Field(Event):
  17. name: str
  18. headers: Headers
  19. @dataclass(frozen=True)
  20. class File(Event):
  21. name: str
  22. filename: str
  23. headers: Headers
  24. @dataclass(frozen=True)
  25. class Data(Event):
  26. data: bytes
  27. more_data: bool
  28. @dataclass(frozen=True)
  29. class Epilogue(Event):
  30. data: bytes
  31. class NeedData(Event):
  32. pass
  33. NEED_DATA = NeedData()
  34. class State(Enum):
  35. PREAMBLE = auto()
  36. PART = auto()
  37. DATA = auto()
  38. DATA_START = auto()
  39. EPILOGUE = auto()
  40. COMPLETE = auto()
  41. # Multipart line breaks MUST be CRLF (\r\n) by RFC-7578, except that
  42. # many implementations break this and either use CR or LF alone.
  43. LINE_BREAK = b"(?:\r\n|\n|\r)"
  44. BLANK_LINE_RE = re.compile(b"(?:\r\n\r\n|\r\r|\n\n)", re.MULTILINE)
  45. LINE_BREAK_RE = re.compile(LINE_BREAK, re.MULTILINE)
  46. # Header values can be continued via a space or tab after the linebreak, as
  47. # per RFC2231
  48. HEADER_CONTINUATION_RE = re.compile(b"%s[ \t]" % LINE_BREAK, re.MULTILINE)
  49. # This must be long enough to contain any line breaks plus any
  50. # additional boundary markers (--) such that they will be found in a
  51. # subsequent search
  52. SEARCH_EXTRA_LENGTH = 8
  53. class MultipartDecoder:
  54. """Decodes a multipart message as bytes into Python events.
  55. The part data is returned as available to allow the caller to save
  56. the data from memory to disk, if desired.
  57. """
  58. def __init__(
  59. self,
  60. boundary: bytes,
  61. max_form_memory_size: int | None = None,
  62. *,
  63. max_parts: int | None = None,
  64. ) -> None:
  65. self.buffer = bytearray()
  66. self.complete = False
  67. self.max_form_memory_size = max_form_memory_size
  68. self.max_parts = max_parts
  69. self.state = State.PREAMBLE
  70. self.boundary = boundary
  71. # Note in the below \h i.e. horizontal whitespace is used
  72. # as [^\S\n\r] as \h isn't supported in python.
  73. # The preamble must end with a boundary where the boundary is
  74. # prefixed by a line break, RFC2046. Except that many
  75. # implementations including Werkzeug's tests omit the line
  76. # break prefix. In addition the first boundary could be the
  77. # epilogue boundary (for empty form-data) hence the matching
  78. # group to understand if it is an epilogue boundary.
  79. self.preamble_re = re.compile(
  80. rb"%s?--%s(--[^\S\n\r]*%s?|[^\S\n\r]*%s)"
  81. % (LINE_BREAK, re.escape(boundary), LINE_BREAK, LINE_BREAK),
  82. re.MULTILINE,
  83. )
  84. # A boundary must include a line break prefix and suffix, and
  85. # may include trailing whitespace. In addition the boundary
  86. # could be the epilogue boundary hence the matching group to
  87. # understand if it is an epilogue boundary.
  88. self.boundary_re = re.compile(
  89. rb"%s--%s(--[^\S\n\r]*%s?|[^\S\n\r]*%s)"
  90. % (LINE_BREAK, re.escape(boundary), LINE_BREAK, LINE_BREAK),
  91. re.MULTILINE,
  92. )
  93. self._search_position = 0
  94. self._parts_decoded = 0
  95. def last_newline(self, data: bytes) -> int:
  96. try:
  97. last_nl = data.rindex(b"\n")
  98. except ValueError:
  99. last_nl = len(data)
  100. try:
  101. last_cr = data.rindex(b"\r")
  102. except ValueError:
  103. last_cr = len(data)
  104. return min(last_nl, last_cr)
  105. def receive_data(self, data: bytes | None) -> None:
  106. if data is None:
  107. self.complete = True
  108. elif (
  109. self.max_form_memory_size is not None
  110. and len(self.buffer) + len(data) > self.max_form_memory_size
  111. ):
  112. raise RequestEntityTooLarge()
  113. else:
  114. self.buffer.extend(data)
  115. def next_event(self) -> Event:
  116. event: Event = NEED_DATA
  117. if self.state == State.PREAMBLE:
  118. match = self.preamble_re.search(self.buffer, self._search_position)
  119. if match is not None:
  120. if match.group(1).startswith(b"--"):
  121. self.state = State.EPILOGUE
  122. else:
  123. self.state = State.PART
  124. data = bytes(self.buffer[: match.start()])
  125. del self.buffer[: match.end()]
  126. event = Preamble(data=data)
  127. self._search_position = 0
  128. else:
  129. # Update the search start position to be equal to the
  130. # current buffer length (already searched) minus a
  131. # safe buffer for part of the search target.
  132. self._search_position = max(
  133. 0, len(self.buffer) - len(self.boundary) - SEARCH_EXTRA_LENGTH
  134. )
  135. elif self.state == State.PART:
  136. match = BLANK_LINE_RE.search(self.buffer, self._search_position)
  137. if match is not None:
  138. headers = self._parse_headers(self.buffer[: match.start()])
  139. # The final header ends with a single CRLF, however a
  140. # blank line indicates the start of the
  141. # body. Therefore the end is after the first CRLF.
  142. headers_end = (match.start() + match.end()) // 2
  143. del self.buffer[:headers_end]
  144. if "content-disposition" not in headers:
  145. raise ValueError("Missing Content-Disposition header")
  146. disposition, extra = parse_options_header(
  147. headers["content-disposition"]
  148. )
  149. name = t.cast(str, extra.get("name"))
  150. filename = extra.get("filename")
  151. if filename is not None:
  152. event = File(
  153. filename=filename,
  154. headers=headers,
  155. name=name,
  156. )
  157. else:
  158. event = Field(
  159. headers=headers,
  160. name=name,
  161. )
  162. self.state = State.DATA_START
  163. self._search_position = 0
  164. self._parts_decoded += 1
  165. if self.max_parts is not None and self._parts_decoded > self.max_parts:
  166. raise RequestEntityTooLarge()
  167. else:
  168. # Update the search start position to be equal to the
  169. # current buffer length (already searched) minus a
  170. # safe buffer for part of the search target.
  171. self._search_position = max(0, len(self.buffer) - SEARCH_EXTRA_LENGTH)
  172. elif self.state == State.DATA_START:
  173. data, del_index, more_data = self._parse_data(self.buffer, start=True)
  174. del self.buffer[:del_index]
  175. event = Data(data=data, more_data=more_data)
  176. if more_data:
  177. self.state = State.DATA
  178. elif self.state == State.DATA:
  179. data, del_index, more_data = self._parse_data(self.buffer, start=False)
  180. del self.buffer[:del_index]
  181. if data or not more_data:
  182. event = Data(data=data, more_data=more_data)
  183. elif self.state == State.EPILOGUE and self.complete:
  184. event = Epilogue(data=bytes(self.buffer))
  185. del self.buffer[:]
  186. self.state = State.COMPLETE
  187. if self.complete and isinstance(event, NeedData):
  188. raise ValueError(f"Invalid form-data cannot parse beyond {self.state}")
  189. return event
  190. def _parse_headers(self, data: bytes) -> Headers:
  191. headers: list[tuple[str, str]] = []
  192. # Merge the continued headers into one line
  193. data = HEADER_CONTINUATION_RE.sub(b" ", data)
  194. # Now there is one header per line
  195. for line in data.splitlines():
  196. line = line.strip()
  197. if line != b"":
  198. name, _, value = line.decode().partition(":")
  199. headers.append((name.strip(), value.strip()))
  200. return Headers(headers)
  201. def _parse_data(self, data: bytes, *, start: bool) -> tuple[bytes, int, bool]:
  202. # Body parts must start with CRLF (or CR or LF)
  203. if start:
  204. match = LINE_BREAK_RE.match(data)
  205. data_start = t.cast(t.Match[bytes], match).end()
  206. else:
  207. data_start = 0
  208. boundary = b"--" + self.boundary
  209. if self.buffer.find(boundary) == -1:
  210. # No complete boundary in the buffer, but there may be
  211. # a partial boundary at the end. As the boundary
  212. # starts with either a nl or cr find the earliest and
  213. # return up to that as data.
  214. data_end = del_index = self.last_newline(data[data_start:]) + data_start
  215. # If amount of data after last newline is far from
  216. # possible length of partial boundary, we should
  217. # assume that there is no partial boundary in the buffer
  218. # and return all pending data.
  219. if (len(data) - data_end) > len(b"\n" + boundary):
  220. data_end = del_index = len(data)
  221. more_data = True
  222. else:
  223. match = self.boundary_re.search(data)
  224. if match is not None:
  225. if match.group(1).startswith(b"--"):
  226. self.state = State.EPILOGUE
  227. else:
  228. self.state = State.PART
  229. data_end = match.start()
  230. del_index = match.end()
  231. else:
  232. data_end = del_index = self.last_newline(data[data_start:]) + data_start
  233. more_data = match is None
  234. return bytes(data[data_start:data_end]), del_index, more_data
  235. class MultipartEncoder:
  236. def __init__(self, boundary: bytes) -> None:
  237. self.boundary = boundary
  238. self.state = State.PREAMBLE
  239. def send_event(self, event: Event) -> bytes:
  240. if isinstance(event, Preamble) and self.state == State.PREAMBLE:
  241. self.state = State.PART
  242. return event.data
  243. elif isinstance(event, (Field, File)) and self.state in {
  244. State.PREAMBLE,
  245. State.PART,
  246. State.DATA,
  247. }:
  248. data = b"\r\n--" + self.boundary + b"\r\n"
  249. data += b'Content-Disposition: form-data; name="%s"' % event.name.encode()
  250. if isinstance(event, File):
  251. data += b'; filename="%s"' % event.filename.encode()
  252. data += b"\r\n"
  253. for name, value in t.cast(Field, event).headers:
  254. if name.lower() != "content-disposition":
  255. data += f"{name}: {value}\r\n".encode()
  256. self.state = State.DATA_START
  257. return data
  258. elif isinstance(event, Data) and self.state == State.DATA_START:
  259. self.state = State.DATA
  260. if len(event.data) > 0:
  261. return b"\r\n" + event.data
  262. else:
  263. return event.data
  264. elif isinstance(event, Data) and self.state == State.DATA:
  265. return event.data
  266. elif isinstance(event, Epilogue):
  267. self.state = State.COMPLETE
  268. return b"\r\n--" + self.boundary + b"--\r\n" + event.data
  269. else:
  270. raise ValueError(f"Cannot generate {event} in state: {self.state}")