Skip to content

Commit b282ff9

Browse files
Allow lax response parsing on Py parser (#7663) (#7665)
(cherry picked from commit bd5f924)
1 parent 85713a4 commit b282ff9

File tree

4 files changed

+187
-85
lines changed

4 files changed

+187
-85
lines changed

CHANGES/7663.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Updated Python parser to comply with latest HTTP specs and allow lax response parsing -- by :user:`Dreamsorcerer`

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ aiohttp/_find_header.c: $(call to-hash,aiohttp/hdrs.py ./tools/gen.py)
5858

5959
# _find_headers generator creates _headers.pyi as well
6060
aiohttp/%.c: aiohttp/%.pyx $(call to-hash,$(CYS)) aiohttp/_find_header.c
61-
cython -3 -o $@ $< -I aiohttp
61+
cython -3 -o $@ $< -I aiohttp -Werror
6262

6363
vendor/llhttp/node_modules: vendor/llhttp/package.json
6464
cd vendor/llhttp; npm install

aiohttp/http_parser.py

Lines changed: 52 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@
66
from enum import IntEnum
77
from typing import (
88
Any,
9+
ClassVar,
910
Final,
1011
Generic,
1112
List,
13+
Literal,
1214
NamedTuple,
1315
Optional,
1416
Pattern,
@@ -25,7 +27,7 @@
2527
from . import hdrs
2628
from .base_protocol import BaseProtocol
2729
from .compression_utils import HAS_BROTLI, BrotliDecompressor, ZLibDecompressor
28-
from .helpers import NO_EXTENSIONS, BaseTimerContext
30+
from .helpers import DEBUG, NO_EXTENSIONS, BaseTimerContext
2931
from .http_exceptions import (
3032
BadHttpMessage,
3133
BadStatusLine,
@@ -49,6 +51,8 @@
4951
"RawResponseMessage",
5052
)
5153

54+
_SEP = Literal[b"\r\n", b"\n"]
55+
5256
ASCIISET: Final[Set[str]] = set(string.printable)
5357

5458
# See https://www.rfc-editor.org/rfc/rfc9110.html#name-overview
@@ -61,6 +65,7 @@
6165
METHRE: Final[Pattern[str]] = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]+")
6266
VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d).(\d)")
6367
HDRRE: Final[Pattern[bytes]] = re.compile(rb"[\x00-\x1F\x7F()<>@,;:\[\]={} \t\"\\]")
68+
HEXDIGIT = re.compile(rb"[0-9a-fA-F]+")
6469

6570

6671
class RawRequestMessage(NamedTuple):
@@ -210,6 +215,8 @@ def parse_headers(
210215

211216

212217
class HttpParser(abc.ABC, Generic[_MsgT]):
218+
lax: ClassVar[bool] = False
219+
213220
def __init__(
214221
self,
215222
protocol: Optional[BaseProtocol] = None,
@@ -272,7 +279,7 @@ def feed_eof(self) -> Optional[_MsgT]:
272279
def feed_data(
273280
self,
274281
data: bytes,
275-
SEP: bytes = b"\r\n",
282+
SEP: _SEP = b"\r\n",
276283
EMPTY: bytes = b"",
277284
CONTENT_LENGTH: istr = hdrs.CONTENT_LENGTH,
278285
METH_CONNECT: str = hdrs.METH_CONNECT,
@@ -296,13 +303,16 @@ def feed_data(
296303
pos = data.find(SEP, start_pos)
297304
# consume \r\n
298305
if pos == start_pos and not self._lines:
299-
start_pos = pos + 2
306+
start_pos = pos + len(SEP)
300307
continue
301308

302309
if pos >= start_pos:
303310
# line found
304-
self._lines.append(data[start_pos:pos])
305-
start_pos = pos + 2
311+
line = data[start_pos:pos]
312+
if SEP == b"\n": # For lax response parsing
313+
line = line.rstrip(b"\r")
314+
self._lines.append(line)
315+
start_pos = pos + len(SEP)
306316

307317
# \r\n\r\n found
308318
if self._lines[-1] == EMPTY:
@@ -319,7 +329,7 @@ def get_content_length() -> Optional[int]:
319329

320330
# Shouldn't allow +/- or other number formats.
321331
# https://www.rfc-editor.org/rfc/rfc9110#section-8.6-2
322-
if not length_hdr.strip(" \t").isdigit():
332+
if not length_hdr.strip(" \t").isdecimal():
323333
raise InvalidHeader(CONTENT_LENGTH)
324334

325335
return int(length_hdr)
@@ -356,6 +366,7 @@ def get_content_length() -> Optional[int]:
356366
readall=self.readall,
357367
response_with_body=self.response_with_body,
358368
auto_decompress=self._auto_decompress,
369+
lax=self.lax,
359370
)
360371
if not payload_parser.done:
361372
self._payload_parser = payload_parser
@@ -374,6 +385,7 @@ def get_content_length() -> Optional[int]:
374385
compression=msg.compression,
375386
readall=True,
376387
auto_decompress=self._auto_decompress,
388+
lax=self.lax,
377389
)
378390
else:
379391
if (
@@ -397,6 +409,7 @@ def get_content_length() -> Optional[int]:
397409
readall=True,
398410
response_with_body=self.response_with_body,
399411
auto_decompress=self._auto_decompress,
412+
lax=self.lax,
400413
)
401414
if not payload_parser.done:
402415
self._payload_parser = payload_parser
@@ -419,7 +432,7 @@ def get_content_length() -> Optional[int]:
419432
assert not self._lines
420433
assert self._payload_parser is not None
421434
try:
422-
eof, data = self._payload_parser.feed_data(data[start_pos:])
435+
eof, data = self._payload_parser.feed_data(data[start_pos:], SEP)
423436
except BaseException as exc:
424437
if self.payload_exception is not None:
425438
self._payload_parser.payload.set_exception(
@@ -614,6 +627,20 @@ class HttpResponseParser(HttpParser[RawResponseMessage]):
614627
Returns RawResponseMessage.
615628
"""
616629

630+
# Lax mode should only be enabled on response parser.
631+
lax = not DEBUG
632+
633+
def feed_data(
634+
self,
635+
data: bytes,
636+
SEP: Optional[_SEP] = None,
637+
*args: Any,
638+
**kwargs: Any,
639+
) -> Tuple[List[Tuple[RawResponseMessage, StreamReader]], bool, bytes]:
640+
if SEP is None:
641+
SEP = b"\r\n" if DEBUG else b"\n"
642+
return super().feed_data(data, SEP, *args, **kwargs)
643+
617644
def parse_message(self, lines: List[bytes]) -> RawResponseMessage:
618645
line = lines[0].decode("utf-8", "surrogateescape")
619646
try:
@@ -638,7 +665,7 @@ def parse_message(self, lines: List[bytes]) -> RawResponseMessage:
638665
version_o = HttpVersion(int(match.group(1)), int(match.group(2)))
639666

640667
# The status code is a three-digit number
641-
if len(status) != 3 or not status.isdigit():
668+
if len(status) != 3 or not status.isdecimal():
642669
raise BadStatusLine(line)
643670
status_i = int(status)
644671

@@ -680,13 +707,15 @@ def __init__(
680707
readall: bool = False,
681708
response_with_body: bool = True,
682709
auto_decompress: bool = True,
710+
lax: bool = False,
683711
) -> None:
684712
self._length = 0
685713
self._type = ParseState.PARSE_NONE
686714
self._chunk = ChunkState.PARSE_CHUNKED_SIZE
687715
self._chunk_size = 0
688716
self._chunk_tail = b""
689717
self._auto_decompress = auto_decompress
718+
self._lax = lax
690719
self.done = False
691720

692721
# payload decompression wrapper
@@ -738,7 +767,7 @@ def feed_eof(self) -> None:
738767
)
739768

740769
def feed_data(
741-
self, chunk: bytes, SEP: bytes = b"\r\n", CHUNK_EXT: bytes = b";"
770+
self, chunk: bytes, SEP: _SEP = b"\r\n", CHUNK_EXT: bytes = b";"
742771
) -> Tuple[bool, bytes]:
743772
# Read specified amount of bytes
744773
if self._type == ParseState.PARSE_LENGTH:
@@ -775,17 +804,22 @@ def feed_data(
775804
else:
776805
size_b = chunk[:pos]
777806

778-
if not size_b.isdigit():
807+
if self._lax: # Allow whitespace in lax mode.
808+
size_b = size_b.strip()
809+
810+
if not re.fullmatch(HEXDIGIT, size_b):
779811
exc = TransferEncodingError(
780812
chunk[:pos].decode("ascii", "surrogateescape")
781813
)
782814
self.payload.set_exception(exc)
783815
raise exc
784816
size = int(bytes(size_b), 16)
785817

786-
chunk = chunk[pos + 2 :]
818+
chunk = chunk[pos + len(SEP) :]
787819
if size == 0: # eof marker
788820
self._chunk = ChunkState.PARSE_MAYBE_TRAILERS
821+
if self._lax and chunk.startswith(b"\r"):
822+
chunk = chunk[1:]
789823
else:
790824
self._chunk = ChunkState.PARSE_CHUNKED_CHUNK
791825
self._chunk_size = size
@@ -807,13 +841,15 @@ def feed_data(
807841
self._chunk_size = 0
808842
self.payload.feed_data(chunk[:required], required)
809843
chunk = chunk[required:]
844+
if self._lax and chunk.startswith(b"\r"):
845+
chunk = chunk[1:]
810846
self._chunk = ChunkState.PARSE_CHUNKED_CHUNK_EOF
811847
self.payload.end_http_chunk_receiving()
812848

813849
# toss the CRLF at the end of the chunk
814850
if self._chunk == ChunkState.PARSE_CHUNKED_CHUNK_EOF:
815-
if chunk[:2] == SEP:
816-
chunk = chunk[2:]
851+
if chunk[: len(SEP)] == SEP:
852+
chunk = chunk[len(SEP) :]
817853
self._chunk = ChunkState.PARSE_CHUNKED_SIZE
818854
else:
819855
self._chunk_tail = chunk
@@ -823,11 +859,11 @@ def feed_data(
823859
# we should get another \r\n otherwise
824860
# trailers need to be skipped until \r\n\r\n
825861
if self._chunk == ChunkState.PARSE_MAYBE_TRAILERS:
826-
head = chunk[:2]
862+
head = chunk[: len(SEP)]
827863
if head == SEP:
828864
# end of stream
829865
self.payload.feed_eof()
830-
return True, chunk[2:]
866+
return True, chunk[len(SEP) :]
831867
# Both CR and LF, or only LF may not be received yet. It is
832868
# expected that CRLF or LF will be shown at the very first
833869
# byte next time, otherwise trailers should come. The last
@@ -845,7 +881,7 @@ def feed_data(
845881
if self._chunk == ChunkState.PARSE_TRAILERS:
846882
pos = chunk.find(SEP)
847883
if pos >= 0:
848-
chunk = chunk[pos + 2 :]
884+
chunk = chunk[pos + len(SEP) :]
849885
self._chunk = ChunkState.PARSE_MAYBE_TRAILERS
850886
else:
851887
self._chunk_tail = chunk

0 commit comments

Comments
 (0)