From a24b6e349b83b5156177349ba19de71ee276e084 Mon Sep 17 00:00:00 2001
From: Marcel Hellkamp <marc@gsites.de>
Date: Mon, 7 Jul 2025 13:45:47 +0200
Subject: [PATCH 1/2] Move segment parsing logic back into the parser.

This patch moves segment parsing logic from private MultipartSegment methods back into the PushMultipartParser where it belongs, and isolates the 'form-data' specific checks into a single private method to allow subclasses to support different segment and stream types (e.g. multipart/byteranges or multipart/mixed).

change: MultipartSegment.name is now typed as optional, but is still guaranteed to be a string for unmodified versions of the parser.
---
 multipart.py | 271 +++++++++++++++++++++++++++++----------------------
 1 file changed, 155 insertions(+), 116 deletions(-)
diff --git a/multipart.py b/multipart.py
index f53480e..c6f4733 100644
--- a/multipart.py
+++ b/multipart.py
@@ -314,7 +314,7 @@ def __init__(
         will always trigger exceptions, even in non-strict mode.
 
         The various limits are meant as safeguards and exceeding any of those
-        limit will trigger a :exc:`ParserLimitReached` exception.
+        limits will trigger a :exc:`ParserLimitReached` exception while parsing.
 
         :param boundary: The multipart boundary as found in the Content-Type header.
         :param content_length: Expected input size in bytes, or -1 if unknown.
@@ -338,11 +338,14 @@ def __init__(
 
         # Internal parser state
         self._parsed = 0
-        self._fieldcount = 0
         self._buffer = bytearray()
-        self._current = MultipartSegment(self)
         self._state = _PREAMBLE
 
+        self._segment = None
+        self._segment_count = 0
+        self._segment_headerlist = []
+        self._segment_limit = -1
+
         #: True if the parser reached the end of the multipart stream, stopped
         #: parsing due to an :attr:`error`, or :meth:`<close>` was called.
         self.closed = False
@@ -359,20 +362,20 @@ def parse(
         self, chunk: Union[bytes, bytearray]
     ) -> Iterator[Union["MultipartSegment", bytearray, None]]:
         """Parse a chunk of data and yield as many result objects as possible
-        with the data given.
+        from the data given.
+
+        For each multipart segment, the parser will emit a single instance of
+        :class:`MultipartSegment` with all headers already present, followed by
+        zero or more non-empty `bytearray` chunks of the segment payload,
+        followed by a single `None` signaling the end of the current segment.
 
-        For each multipart segment, the parser will emit a single instance
-        of :class:`MultipartSegment` with all headers already present, followed
-        by zero or more non-empty `bytearray` instances containing parts of the
-        segment body, followed by a single `None` signaling the end of the
-        current segment.
+        The returned iterator will yield results up to the point where more
+        data is needed or the end of the multipart stream was detected. The
+        iterator must be fully consumed before feeding more data to the parser.
 
-        The returned iterator will stop if more data is required or if the end
-        of the multipart stream was detected. The iterator must be fully consumed
-        before parsing the next chunk. End of input can be signaled by parsing
-        an empty chunk or closing the parser. This is important to verify the
-        multipart message was parsed completely and the last segment is actually
-        complete.
+        End of input can be signaled by parsing an empty chunk or closing the
+        parser. This is important to detect incomplete multipart streams
+        where the last segment is still missing data.
 
         Format errors or exceeded limits will trigger :exc:`MultipartError`.
         """
@@ -423,6 +426,7 @@ def parse(
                         tail = buffer[next_start - 2 : next_start]
 
                         if tail == b"\r\n":  # Normal delimiter found
+                            self._on_segment_start()
                             self._state = _HEADER
                             offset = next_start
                             continue
@@ -449,12 +453,12 @@ def parse(
                     nl = buffer.find(b"\r\n", offset)
 
                     if nl > offset:  # Non-empty header line
-                        self._current._add_headerline(buffer[offset:nl])
+                        self._on_segment_headerline(buffer[offset:nl])
                         offset = nl + 2
                         continue
                     elif nl == offset:  # Empty header line -> End of header section
-                        self._current._close_headers()
-                        yield self._current
+                        self._segment = self._create_segment(self._segment_headerlist)
+                        yield self._segment
                         self._state = _BODY
                         offset += 2
                         continue
@@ -481,18 +485,17 @@ def parse(
 
                         if tail == b"\r\n" or tail == b"--":
                             if index > offset:
-                                self._current._update_size(index - offset)
-                                yield buffer[offset:index]
+                                yield self._on_segment_payload(buffer[offset:index])
 
                             offset = next_start
-                            self._current._mark_complete()
-                            yield None  # End of segment
+                            self._on_segment_complete()
+                            yield None  # end of segment
 
                             if tail == b"--":  # Last delimiter
                                 self._state = _COMPLETE
                                 break
                             else:  # Normal delimiter
-                                self._current = MultipartSegment(self)
+                                self._on_segment_start()
                                 self._state = _HEADER
                                 continue
 
@@ -500,8 +503,7 @@ def parse(
                     # the end, but emit the rest.
                     chunk_end = bufferlen - (d_len + 1)
                     assert chunk_end > offset  # Always true
-                    self._current._update_size(chunk_end - offset)
-                    yield buffer[offset:chunk_end]
+                    yield self._on_segment_payload(buffer[offset:chunk_end])
                     offset = chunk_end
                     break  # wait for more data
 
@@ -519,6 +521,100 @@ def parse(
             self.close(check_complete=False)
             raise
 
+    def _on_segment_start(self):
+        """Reset internal state to start a new segment"""
+        self._segment_count += 1
+        if self._segment_count > self.max_segment_count:
+            raise ParserLimitReached("Maximum segment count exceeded")
+
+        self._segment = None
+        self._segment_headerlist = []
+        self._segment_limit = -1
+
+    def _on_segment_headerline(self, line: Union[bytes, bytearray]):
+        """Parse a raw segment header line, which may be a continuation of a
+        previous line in non-strict mode."""
+        assert line and self._segment is None
+
+        # Handle header continuation (headers split into multiple lines)
+        if line[0] in b" \t":  # Multi-line header value
+            if not self._segment_headerlist or self.strict:
+                raise StrictParserError("Unexpected segment header continuation")
+            prev = ": ".join(self._segment_headerlist.pop())
+            line = prev.encode(self.header_charset) + b" " + line.strip()
+
+        # Enforce header limits
+        if len(line) > self.max_header_size:
+            raise ParserLimitReached("Maximum segment header length exceeded")
+        if len(self._segment_headerlist) >= self.max_header_count:
+            raise ParserLimitReached("Maximum segment header count exceeded")
+
+        # Decode headers into header name and value
+        try:
+            name, col, value = line.decode(self.header_charset).partition(":")
+            name = name.strip().title()
+            if not col or not name:
+                raise ParserError("Malformed segment header")
+            if " " in name or not name.isascii() or not name.isprintable():
+                raise ParserError("Invalid segment header name")
+            value = value.strip()
+        except UnicodeDecodeError as err:
+            raise ParserError("Segment header failed to decode", err)
+
+        if name == "Content-Length":
+            if not value.isdecimal():
+                raise ParserError("Invalid segment Content-Length header value")
+            content_length = int(value)
+            if content_length > self.max_segment_size:
+                raise ParserLimitReached(
+                    "Segment Content-Length larger than maximum segment size"
+                )
+            self._segment_limit = content_length
+
+        self._segment_headerlist.append((name, value))
+
+    def _create_segment(self, headerlist: List[Tuple[str, str]]):
+        """Create a :class:`MultipartSegment` from a list of headers and check
+        for missing or invalid headers.
+
+        This implementation is specific to 'multipart/form-data' and will reject
+        segments with missing or invalid `Content-Disposition` headers or header
+        options.
+
+        Subclasses can override this method to support other multipart stream
+        types (e.g. multipart/byteranges) with different restrictions.
+        """
+        segment = MultipartSegment(headerlist)
+
+        if segment.disposition != "form-data":
+            if segment.disposition is None:
+                raise ParserError("Missing Content-Disposition segment header")
+            raise ParserError("Invalid Content-Disposition segment header: Wrong type")
+        if segment.name is None:
+            segment.name = ""
+            if self.strict:
+                raise StrictParserError(
+                    "Invalid Content-Disposition segment header: Missing name option"
+                )
+
+        return segment
+
+    def _on_segment_payload(self, chunk: Union[bytes, bytearray]):
+        assert self._segment is not None and not self._segment.complete
+        self._segment.size += len(chunk)
+        if self._segment.size > self.max_segment_size:
+            raise ParserLimitReached("Maximum segment size exceeded")
+        if -1 < self._segment_limit < self._segment.size:
+            raise ParserError("Segment Content-Length exceeded")
+        return chunk
+
+    def _on_segment_complete(self):
+        assert self._segment is not None and not self._segment.complete
+        if self._segment.size < self._segment_limit:
+            raise ParserError("Segment size does not match Content-Length header")
+        self._segment.complete = True
+        return None
+
     def close(self, check_complete=True):
         """
         Close this parser if not already closed.
@@ -538,119 +634,62 @@ def close(self, check_complete=True):
 
 
 class MultipartSegment:
-    """A :class:`MultipartSegment` represents the header section of a single
-    multipart part and provides convenient access to part headers and other
-    details (e.g. :attr:`name` and :attr:`filename`). Each segment also tracks
-    its own content :attr:`size` while the :class:`PushMultipartParser`
-    processes more data, and is marked as :attr:`complete` as soon as the next
-    multipart border is found. Segments do not store or buffer any of their
-    content data, though.
+    """Representation of the header section of a single multipart segment.
+
+    :class:`MultipartSegment` instances do not store or buffer any payload data,
+    but the parser will update the payload :attr:`size` property while parsing,
+    and mark the segment as :attr:`complete` when done.
     """
 
-    #: List of headers as name/value pairs with normalized (Title-Case) names.
+    #: Ordered list of headers as (name, value) pairs. Header names are
+    #: normalized (Title-Case) and values are stripped of leading or trailing
+    #: whitespace.
     headerlist: List[Tuple[str, str]]
-    #: The 'name' option of the `Content-Disposition` header. Always a string,
-    #: but may be empty.
-    name: str
+
+    #: The cleaned up `Content-Disposition` header value without any header
+    #: options. This will always be 'form-data' in HTTP multipart contexts.
+    disposition: Optional[str]
+    #: The 'name' option of the `Content-Disposition` header. For `form-data`
+    #: this will always be a string, but the string may be empty.
+    name: Optional[str]
     #: The optional 'filename' option of the `Content-Disposition` header.
     filename: Optional[str]
-    #: The cleaned up `Content-Type` segment header, if present. The value is
-    #: lower-cased and header options (e.g. charset) are removed.
+
+    #: The cleaned up `Content-Type` segment header without any header options.
     content_type: Optional[str]
-    #: The 'charset' option of the `Content-Type` header, if present.
+    #: The optional 'charset' option of the `Content-Type` header.
     charset: Optional[str]
 
-    #: Segment body size (so far). Will be updated during parsing.
+    #: Segment body size (so far). Will be updated for each chunk of payload
+    #: during parsing.
     size: int
-    #: If true, the segment content was fully parsed and the size value is final.
+    #: True if the parser detected the end of the segment and no more payload
+    #: chunks are to be expected.
     complete: bool
 
-    def __init__(self, parser: PushMultipartParser):
-        """Private constructor, used by :class:`PushMultipartParser`"""
-        self._parser = parser
+    def __init__(self, headerlist: List[Tuple[str, str]]):
+        """Private constructor used by :class:`PushMultipartParser`"""
 
-        if parser._fieldcount + 1 > parser.max_segment_count:
-            raise ParserLimitReached("Maximum segment count exceeded")
-        parser._fieldcount += 1
-
-        self.headerlist = []
         self.size = 0
         self.complete = False
 
-        self.name = None  # type: ignore
+        self.headerlist = headerlist
+        self.disposition = None
+        self.name = None
         self.filename = None
         self.content_type = None
         self.charset = None
-        self._clen = -1
-        self._size_limit = parser.max_segment_size
-
-    def _add_headerline(self, line: Union[bytes, bytearray]):
-        assert line and self.name is None
-        parser = self._parser
 
-        if line[0] in b" \t":  # Multi-line header value
-            if not self.headerlist or parser.strict:
-                raise StrictParserError("Unexpected segment header continuation")
-            prev = ": ".join(self.headerlist.pop())
-            line = prev.encode(parser.header_charset) + b" " + line.strip()
-
-        if len(line) > parser.max_header_size:
-            raise ParserLimitReached("Maximum segment header length exceeded")
-        if len(self.headerlist) >= parser.max_header_count:
-            raise ParserLimitReached("Maximum segment header count exceeded")
-
-        try:
-            name, col, value = line.decode(parser.header_charset).partition(":")
-            name = name.strip()
-            if not col or not name:
-                raise ParserError("Malformed segment header")
-            if " " in name or not name.isascii() or not name.isprintable():
-                raise ParserError("Invalid segment header name")
-        except UnicodeDecodeError as err:
-            raise ParserError("Segment header failed to decode", err)
-
-        self.headerlist.append((name.title(), value.strip()))
-
-    def _close_headers(self):
-        assert self.name is None
-
-        for h, v in self.headerlist:
-            if h == "Content-Disposition":
-                dtype, args = parse_options_header(
-                    v, unquote=content_disposition_unquote
+        for name, value in headerlist:
+            if name == "Content-Disposition":
+                self.disposition, args = parse_options_header(
+                    value, unquote=content_disposition_unquote
                 )
-                if dtype != "form-data":
-                    raise ParserError(
-                        "Invalid Content-Disposition segment header: Wrong type"
-                    )
-                if "name" not in args and self._parser.strict:
-                    raise StrictParserError(
-                        "Invalid Content-Disposition segment header: Missing name option"
-                    )
-                self.name = args.get("name", "")
+                self.name = args.get("name")
                 self.filename = args.get("filename")
-            elif h == "Content-Type":
-                self.content_type, args = parse_options_header(v)
+            elif name == "Content-Type":
+                self.content_type, args = parse_options_header(value)
                 self.charset = args.get("charset")
-            elif h == "Content-Length" and v.isdecimal():
-                self._clen = int(v)
-
-        if self.name is None:
-            raise ParserError("Missing Content-Disposition segment header")
-
-    def _update_size(self, bytecount: int):
-        assert self.name is not None and not self.complete
-        self.size += bytecount
-        if self._clen >= 0 and self.size > self._clen:
-            raise ParserError("Segment Content-Length exceeded")
-        if self.size > self._size_limit:
-            raise ParserLimitReached("Maximum segment size exceeded")
-
-    def _mark_complete(self):
-        assert self.name is not None and not self.complete
-        if self._clen >= 0 and self.size != self._clen:
-            raise ParserError("Segment size does not match Content-Length header")
-        self.complete = True
 
     def header(self, name: str, default=None):
         """Return the value of a header if present, or a default value."""

From 18eaf29439c04f71da78ef4c4fab9a015a78bfa4 Mon Sep 17 00:00:00 2001
From: Marcel Hellkamp <marc@gsites.de>
Date: Mon, 7 Jul 2025 14:05:34 +0200
Subject: [PATCH 2/2] perf: Avoid checks for known header names

---
 multipart.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/multipart.py b/multipart.py
index c6f4733..638814e 100644
--- a/multipart.py
+++ b/multipart.py
@@ -287,6 +287,8 @@ def parse_options_header(header, options=None, unquote=header_unquote):
 ##############################################################################
 
 
+# Constants used by the parser
+_HEADER_EXPECTED = frozenset(["Content-Disposition", "Content-Type", "Content-Length"])
 # Parser states as constants
 _PREAMBLE = "PREAMBLE"
 _HEADER = "HEADER"
@@ -555,8 +557,9 @@ def _on_segment_headerline(self, line: Union[bytes, bytearray]):
             name = name.strip().title()
             if not col or not name:
                 raise ParserError("Malformed segment header")
-            if " " in name or not name.isascii() or not name.isprintable():
-                raise ParserError("Invalid segment header name")
+            if name not in _HEADER_EXPECTED:
+                if " " in name or not name.isascii() or not name.isprintable():
+                    raise ParserError("Invalid segment header name")
             value = value.strip()
         except UnicodeDecodeError as err:
             raise ParserError("Segment header failed to decode", err)