55from contextlib import suppress
66from enum import IntEnum
77from typing import (
8+ Any ,
9+ ClassVar ,
810 Final ,
911 Generic ,
1012 List ,
13+ Literal ,
1114 NamedTuple ,
1215 Optional ,
1316 Pattern ,
2427from . import hdrs
2528from .base_protocol import BaseProtocol
2629from .compression_utils import HAS_BROTLI , BrotliDecompressor , ZLibDecompressor
27- from .helpers import NO_EXTENSIONS , BaseTimerContext
30+ from .helpers import DEBUG , NO_EXTENSIONS , BaseTimerContext
2831from .http_exceptions import (
2932 BadHttpMessage ,
3033 BadStatusLine ,
4851 "RawResponseMessage" ,
4952)
5053
54+ _SEP = Literal [b"\r \n " , b"\n " ]
55+
5156ASCIISET : Final [Set [str ]] = set (string .printable )
5257
5358# See https://www.rfc-editor.org/rfc/rfc9110.html#name-overview
6065METHRE : Final [Pattern [str ]] = re .compile (r"[!#$%&'*+\-.^_`|~0-9A-Za-z]+" )
6166VERSRE : Final [Pattern [str ]] = re .compile (r"HTTP/(\d).(\d)" )
6267HDRRE : Final [Pattern [bytes ]] = re .compile (rb"[\x00-\x1F\x7F()<>@,;:\[\]={} \t\"\\]" )
68+ HEXDIGIT = re .compile (rb"[0-9a-fA-F]+" )
6369
6470
6571class RawRequestMessage (NamedTuple ):
@@ -206,6 +212,8 @@ def parse_headers(
206212
207213
208214class HttpParser (abc .ABC , Generic [_MsgT ]):
215+ lax : ClassVar [bool ] = False
216+
209217 def __init__ (
210218 self ,
211219 protocol : BaseProtocol ,
@@ -266,7 +274,7 @@ def feed_eof(self) -> Optional[_MsgT]:
266274 def feed_data (
267275 self ,
268276 data : bytes ,
269- SEP : bytes = b"\r \n " ,
277+ SEP : _SEP = b"\r \n " ,
270278 EMPTY : bytes = b"" ,
271279 CONTENT_LENGTH : istr = hdrs .CONTENT_LENGTH ,
272280 METH_CONNECT : str = hdrs .METH_CONNECT ,
@@ -288,13 +296,16 @@ def feed_data(
288296 pos = data .find (SEP , start_pos )
289297 # consume \r\n
290298 if pos == start_pos and not self ._lines :
291- start_pos = pos + 2
299+ start_pos = pos + len ( SEP )
292300 continue
293301
294302 if pos >= start_pos :
295303 # line found
296- self ._lines .append (data [start_pos :pos ])
297- start_pos = pos + 2
304+ line = data [start_pos :pos ]
305+ if SEP == b"\n " : # For lax response parsing
306+ line = line .rstrip (b"\r " )
307+ self ._lines .append (line )
308+ start_pos = pos + len (SEP )
298309
299310 # \r\n\r\n found
300311 if self ._lines [- 1 ] == EMPTY :
@@ -311,7 +322,7 @@ def get_content_length() -> Optional[int]:
311322
312323 # Shouldn't allow +/- or other number formats.
313324 # https://www.rfc-editor.org/rfc/rfc9110#section-8.6-2
314- if not length_hdr .strip (" \t " ).isdigit ():
325+ if not length_hdr .strip (" \t " ).isdecimal ():
315326 raise InvalidHeader (CONTENT_LENGTH )
316327
317328 return int (length_hdr )
@@ -348,6 +359,7 @@ def get_content_length() -> Optional[int]:
348359 readall = self .readall ,
349360 response_with_body = self .response_with_body ,
350361 auto_decompress = self ._auto_decompress ,
362+ lax = self .lax ,
351363 )
352364 if not payload_parser .done :
353365 self ._payload_parser = payload_parser
@@ -366,6 +378,7 @@ def get_content_length() -> Optional[int]:
366378 compression = msg .compression ,
367379 readall = True ,
368380 auto_decompress = self ._auto_decompress ,
381+ lax = self .lax ,
369382 )
370383 else :
371384 if (
@@ -389,6 +402,7 @@ def get_content_length() -> Optional[int]:
389402 readall = True ,
390403 response_with_body = self .response_with_body ,
391404 auto_decompress = self ._auto_decompress ,
405+ lax = self .lax ,
392406 )
393407 if not payload_parser .done :
394408 self ._payload_parser = payload_parser
@@ -411,7 +425,7 @@ def get_content_length() -> Optional[int]:
411425 assert not self ._lines
412426 assert self ._payload_parser is not None
413427 try :
414- eof , data = self ._payload_parser .feed_data (data [start_pos :])
428+ eof , data = self ._payload_parser .feed_data (data [start_pos :], SEP )
415429 except BaseException as exc :
416430 if self .payload_exception is not None :
417431 self ._payload_parser .payload .set_exception (
@@ -456,12 +470,21 @@ def parse_headers(
456470
457471 # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-6
458472 # https://www.rfc-editor.org/rfc/rfc9110.html#name-collected-abnf
459- singletons = (hdrs .CONTENT_LENGTH , hdrs .CONTENT_LOCATION , hdrs .CONTENT_RANGE ,
460- hdrs .CONTENT_TYPE , hdrs .ETAG , hdrs .HOST , hdrs .MAX_FORWARDS ,
461- hdrs .SERVER , hdrs .TRANSFER_ENCODING , hdrs .USER_AGENT )
473+ singletons = (
474+ hdrs .CONTENT_LENGTH ,
475+ hdrs .CONTENT_LOCATION ,
476+ hdrs .CONTENT_RANGE ,
477+ hdrs .CONTENT_TYPE ,
478+ hdrs .ETAG ,
479+ hdrs .HOST ,
480+ hdrs .MAX_FORWARDS ,
481+ hdrs .SERVER ,
482+ hdrs .TRANSFER_ENCODING ,
483+ hdrs .USER_AGENT ,
484+ )
462485 bad_hdr = next ((h for h in singletons if len (headers .getall (h , ())) > 1 ), None )
463486 if bad_hdr is not None :
464- raise BadHttpMessage ("Duplicate '{}' header found." . format ( bad_hdr ) )
487+ raise BadHttpMessage (f "Duplicate '{ bad_hdr } ' header found." )
465488
466489 # keep-alive
467490 conn = headers .get (hdrs .CONNECTION )
@@ -597,6 +620,20 @@ class HttpResponseParser(HttpParser[RawResponseMessage]):
597620 Returns RawResponseMessage.
598621 """
599622
623+ # Lax mode should only be enabled on response parser.
624+ lax = not DEBUG
625+
def feed_data(
    self,
    data: bytes,
    SEP: Optional[_SEP] = None,
    *args: Any,
    **kwargs: Any,
) -> Tuple[List[Tuple[RawResponseMessage, StreamReader]], bool, bytes]:
    """Feed raw response bytes to the parser, picking a default separator.

    This is the response-parser override of the base ``feed_data``.

    When ``SEP`` is not given, the separator depends on ``DEBUG``:
    strict ``b"\\r\\n"`` when ``DEBUG`` is truthy, otherwise a bare
    ``b"\\n"`` (the lax mode that this class enables via
    ``lax = not DEBUG``). An explicitly passed ``SEP`` is forwarded
    unchanged.

    Any further positional and keyword arguments are passed straight
    through to the parent implementation, whose result is returned as is.
    """
    if SEP is None:
        # The literals must be exactly CRLF / LF: the parent uses len(SEP)
        # to advance past each line ending, so stray bytes would break it.
        SEP = b"\r\n" if DEBUG else b"\n"
    return super().feed_data(data, SEP, *args, **kwargs)
636+
600637 def parse_message (self , lines : List [bytes ]) -> RawResponseMessage :
601638 line = lines [0 ].decode ("utf-8" , "surrogateescape" )
602639 try :
@@ -621,7 +658,7 @@ def parse_message(self, lines: List[bytes]) -> RawResponseMessage:
621658 version_o = HttpVersion (int (match .group (1 )), int (match .group (2 )))
622659
623660 # The status code is a three-digit number
624- if len (status ) != 3 or not status .isdigit ():
661+ if len (status ) != 3 or not status .isdecimal ():
625662 raise BadStatusLine (line )
626663 status_i = int (status )
627664
@@ -663,13 +700,15 @@ def __init__(
663700 readall : bool = False ,
664701 response_with_body : bool = True ,
665702 auto_decompress : bool = True ,
703+ lax : bool = False ,
666704 ) -> None :
667705 self ._length = 0
668706 self ._type = ParseState .PARSE_NONE
669707 self ._chunk = ChunkState .PARSE_CHUNKED_SIZE
670708 self ._chunk_size = 0
671709 self ._chunk_tail = b""
672710 self ._auto_decompress = auto_decompress
711+ self ._lax = lax
673712 self .done = False
674713
675714 # payload decompression wrapper
@@ -721,7 +760,7 @@ def feed_eof(self) -> None:
721760 )
722761
723762 def feed_data (
724- self , chunk : bytes , SEP : bytes = b"\r \n " , CHUNK_EXT : bytes = b";"
763+ self , chunk : bytes , SEP : _SEP = b"\r \n " , CHUNK_EXT : bytes = b";"
725764 ) -> Tuple [bool , bytes ]:
726765 # Read specified amount of bytes
727766 if self ._type == ParseState .PARSE_LENGTH :
@@ -757,17 +796,22 @@ def feed_data(
757796 else :
758797 size_b = chunk [:pos ]
759798
760- if not size_b .isdigit ():
799+ if self ._lax : # Allow whitespace in lax mode.
800+ size_b = size_b .strip ()
801+
802+ if not re .fullmatch (HEXDIGIT , size_b ):
761803 exc = TransferEncodingError (
762804 chunk [:pos ].decode ("ascii" , "surrogateescape" )
763805 )
764806 self .payload .set_exception (exc )
765807 raise exc
766808 size = int (bytes (size_b ), 16 )
767809
768- chunk = chunk [pos + 2 :]
810+ chunk = chunk [pos + len ( SEP ) :]
769811 if size == 0 : # eof marker
770812 self ._chunk = ChunkState .PARSE_MAYBE_TRAILERS
813+ if self ._lax and chunk .startswith (b"\r " ):
814+ chunk = chunk [1 :]
771815 else :
772816 self ._chunk = ChunkState .PARSE_CHUNKED_CHUNK
773817 self ._chunk_size = size
@@ -789,13 +833,15 @@ def feed_data(
789833 self ._chunk_size = 0
790834 self .payload .feed_data (chunk [:required ], required )
791835 chunk = chunk [required :]
836+ if self ._lax and chunk .startswith (b"\r " ):
837+ chunk = chunk [1 :]
792838 self ._chunk = ChunkState .PARSE_CHUNKED_CHUNK_EOF
793839 self .payload .end_http_chunk_receiving ()
794840
795841 # toss the CRLF at the end of the chunk
796842 if self ._chunk == ChunkState .PARSE_CHUNKED_CHUNK_EOF :
797- if chunk [:2 ] == SEP :
798- chunk = chunk [2 :]
843+ if chunk [: len ( SEP ) ] == SEP :
844+ chunk = chunk [len ( SEP ) :]
799845 self ._chunk = ChunkState .PARSE_CHUNKED_SIZE
800846 else :
801847 self ._chunk_tail = chunk
@@ -805,11 +851,11 @@ def feed_data(
805851 # we should get another \r\n otherwise
806852 # trailers need to be skipped until \r\n\r\n
807853 if self ._chunk == ChunkState .PARSE_MAYBE_TRAILERS :
808- head = chunk [:2 ]
854+ head = chunk [: len ( SEP ) ]
809855 if head == SEP :
810856 # end of stream
811857 self .payload .feed_eof ()
812- return True , chunk [2 :]
858+ return True , chunk [len ( SEP ) :]
813859 # Both CR and LF, or only LF may not be received yet. It is
814860 # expected that CRLF or LF will be shown at the very first
815861 # byte next time, otherwise trailers should come. The last
@@ -827,7 +873,7 @@ def feed_data(
827873 if self ._chunk == ChunkState .PARSE_TRAILERS :
828874 pos = chunk .find (SEP )
829875 if pos >= 0 :
830- chunk = chunk [pos + 2 :]
876+ chunk = chunk [pos + len ( SEP ) :]
831877 self ._chunk = ChunkState .PARSE_MAYBE_TRAILERS
832878 else :
833879 self ._chunk_tail = chunk
0 commit comments