Commit 1e4a5b2

Migrate more HTTP code
1 parent f5932d3 commit 1e4a5b2

4 files changed: +158 -181 lines changed


feedparser/__init__.py

Lines changed: 25 additions & 158 deletions
@@ -54,10 +54,6 @@
 # change this to your application name and URL.
 USER_AGENT = "feedparser/%s +https://github.com/kurtmckee/feedparser/" % __version__

-# HTTP "Accept" header to send to servers when downloading feeds. If you don't
-# want to send an Accept header, set this to None.
-ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
-
 # List of preferred XML parsers, by SAX driver name. These will be tried first,
 # but if they're not installed, Python will keep searching through its own list
 # of pre-installed parsers until it finds one that supports everything we need.
@@ -84,7 +80,6 @@
 # ---------- required modules (should come with any Python distribution) ----------
 import copy
 import re
-import struct

 try:
     from html.entities import name2codepoint, entitydefs
@@ -101,33 +96,15 @@

 try:
     import urllib.parse
-    import urllib.request
 except ImportError:
-    from urllib import splithost, splittype, splituser
-    from urllib2 import build_opener
     from urlparse import urlparse

     class urllib(object):
         class parse(object):
-            splithost = staticmethod(splithost)
-            splittype = staticmethod(splittype)
-            splituser = staticmethod(splituser)
             urlparse = staticmethod(urlparse)
-
-        class request(object):
-            build_opener = staticmethod(build_opener)

 # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------

-# gzip is included with most Python distributions, but may not be available if you compiled your own
-try:
-    import gzip
-except ImportError:
-    gzip = None
-try:
-    import zlib
-except ImportError:
-    zlib = None
-
 # If a real XML parser is available, feedparser will attempt to use it. feedparser has
 # been tested with the built-in SAX parser and libxml2. On platforms where the
 # Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
@@ -156,7 +133,7 @@ def _xmlescape(data,entities={}):
 from .encodings import convert_to_utf8
 from .exceptions import *
 from .html import _BaseHTMLProcessor, _cp1252
-from .http import _build_urllib2_request, _FeedURLHandler
+from . import http
 from .namespaces import _base, cc, dc, georss, itunes, mediarss, psc
 from .sanitizer import _sanitizeHTML, _HTMLSanitizer
 from .sgml import *
@@ -1029,7 +1006,7 @@ def decodeEntities(self, element, data):
     def strattrs(self, attrs):
         return ''.join([' %s="%s"' % (n,v.replace('"','&quot;')) for n,v in attrs])

-def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
+def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result):
     """URL, filename, or string --> stream

     This function lets you define parsers that take any input source
@@ -1064,44 +1041,16 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
     """

     if hasattr(url_file_stream_or_string, 'read'):
-        return url_file_stream_or_string
+        return url_file_stream_or_string.read()

     if isinstance(url_file_stream_or_string, basestring) \
       and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
-        # Deal with the feed URI scheme
-        if url_file_stream_or_string.startswith('feed:http'):
-            url_file_stream_or_string = url_file_stream_or_string[5:]
-        elif url_file_stream_or_string.startswith('feed:'):
-            url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
-        if not agent:
-            agent = USER_AGENT
-        # Test for inline user:password credentials for HTTP basic auth
-        auth = None
-        if base64 and not url_file_stream_or_string.startswith('ftp:'):
-            urltype, rest = urllib.parse.splittype(url_file_stream_or_string)
-            realhost, rest = urllib.parse.splithost(rest)
-            if realhost:
-                user_passwd, realhost = urllib.parse.splituser(realhost)
-                if user_passwd:
-                    url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
-                    auth = base64.standard_b64encode(user_passwd).strip()
-
-        # iri support
-        if not isinstance(url_file_stream_or_string, bytes_):
-            url_file_stream_or_string = _convert_to_idn(url_file_stream_or_string)
-
-        # try to open with urllib2 (to use optional headers)
-        request = _build_urllib2_request(url_file_stream_or_string, agent, ACCEPT_HEADER, etag, modified, referrer, auth, request_headers)
-        opener = urllib.request.build_opener(*tuple(handlers + [_FeedURLHandler()]))
-        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
-        try:
-            return opener.open(request)
-        finally:
-            opener.close() # JohnD
+        return http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)

     # try to open with native open function (if url_file_stream_or_string is a filename)
     try:
-        return open(url_file_stream_or_string, 'rb')
+        with open(url_file_stream_or_string, 'rb') as f:
+            data = f.read()
     except (IOError, UnicodeEncodeError, TypeError):
         # if url_file_stream_or_string is a unicode object that
         # cannot be converted to the encoding returned by
@@ -1111,11 +1060,13 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
         # (such as an XML document encoded in UTF-32), TypeError will
         # be thrown.
         pass
+    else:
+        return data

     # treat url_file_stream_or_string as string
     if not isinstance(url_file_stream_or_string, bytes_):
-        return _StringIO(url_file_stream_or_string.encode('utf-8'))
-    return _StringIO(url_file_stream_or_string)
+        return url_file_stream_or_string.encode('utf-8')
+    return url_file_stream_or_string

 # Match XML entity declarations.
 # Example: <!ENTITY copyright "(C)">
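
Taken together, the three hunks above change the contract of _open_resource(): it used to hand back a file-like object for parse() to read and close, and it now returns the resource's raw bytes, recording any network metadata in the shared result dict. A before/after sketch of the call shape (variable names not taken from the diff are placeholders):

    # Old contract: the caller read the stream itself.
    f = _open_resource(url, etag, modified, agent, referrer, handlers, request_headers)
    data = f.read()

    # New contract: bytes come back directly; http.get() (or the local
    # file/string branches) fill in `result` along the way.
    data = _open_resource(url, etag, modified, agent, referrer, handlers, request_headers, result)
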
@@ -1175,7 +1126,7 @@ def replace_doctype(data):
     return version, data, safe_entities


-def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
+def parse(url_file_stream_or_string, etag=None, modified=None, agent=USER_AGENT, referrer=None, handlers=None, request_headers=None, response_headers=None):
     '''Parse a feed from a URL, file, stream, or string.

     request_headers, if given, is a dict from http header name to value to add
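
One wrinkle in the new signature worth flagging, since it is easy to miss: agent=USER_AGENT is evaluated once, when the def statement runs. The old code (deleted in the _open_resource hunk above) fell back to the module-level USER_AGENT at call time via `if not agent: agent = USER_AGENT`, so reassigning feedparser.USER_AGENT after import affected later requests. With a def-time default that trick presumably stops working; passing the agent explicitly still does:

    import feedparser

    # "MyApp/1.0 ..." is a made-up example value, not part of this commit.
    feedparser.USER_AGENT = "MyApp/1.0 +https://example.com/"

    # Reliable under both the old and new code: pass the agent per call.
    d = feedparser.parse("https://example.com/feed.xml", agent=feedparser.USER_AGENT)
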
@@ -1184,116 +1135,32 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
     :return: A :class:`FeedParserDict`.
     '''

-    if handlers is None:
-        handlers = []
-    if request_headers is None:
-        request_headers = {}
-    if response_headers is None:
-        response_headers = {}
-
-    result = FeedParserDict()
-    result['feed'] = FeedParserDict()
-    result['entries'] = []
-    result['bozo'] = 0
-    if not isinstance(handlers, list):
-        handlers = [handlers]
-    try:
-        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
-        data = f.read()
-    except Exception as e:
-        result['bozo'] = 1
-        result['bozo_exception'] = e
-        data = None
-        f = None
-
-    if hasattr(f, 'headers'):
-        result['headers'] = dict(f.headers)
-    # overwrite existing headers using response_headers
-    if 'headers' in result:
-        result['headers'].update(response_headers)
-    elif response_headers:
-        result['headers'] = copy.deepcopy(response_headers)
-
-    # lowercase all of the HTTP headers for comparisons per RFC 2616
-    if 'headers' in result:
-        http_headers = dict((k.lower(), v) for k, v in result['headers'].items())
-    else:
-        http_headers = {}
+    result = FeedParserDict(
+        bozo = False,
+        entries = [],
+        feed = FeedParserDict(),
+        headers = {},
+    )

-    # if feed is gzip-compressed, decompress it
-    if f and data and http_headers:
-        if gzip and 'gzip' in http_headers.get('content-encoding', ''):
-            try:
-                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
-            except (EOFError, IOError, struct.error) as e:
-                # IOError can occur if the gzip header is bad.
-                # struct.error can occur if the data is damaged.
-                result['bozo'] = 1
-                result['bozo_exception'] = e
-                if isinstance(e, struct.error):
-                    # A gzip header was found but the data is corrupt.
-                    # Ideally, we should re-request the feed without the
-                    # 'Accept-encoding: gzip' header, but we don't.
-                    data = None
-        elif zlib and 'deflate' in http_headers.get('content-encoding', ''):
-            try:
-                data = zlib.decompress(data)
-            except zlib.error as e:
-                try:
-                    # The data may have no headers and no checksum.
-                    data = zlib.decompress(data, -15)
-                except zlib.error as e:
-                    result['bozo'] = 1
-                    result['bozo_exception'] = e
-
-    # save HTTP headers
-    if http_headers:
-        if 'etag' in http_headers:
-            etag = http_headers.get('etag', '')
-            if isinstance(etag, bytes_):
-                etag = etag.decode('utf-8', 'ignore')
-            if etag:
-                result['etag'] = etag
-        if 'last-modified' in http_headers:
-            modified = http_headers.get('last-modified', '')
-            if modified:
-                result['modified'] = modified
-                result['modified_parsed'] = _parse_date(modified)
-    if hasattr(f, 'url'):
-        if isinstance(f.url, bytes_):
-            result['href'] = f.url.decode('utf-8', 'ignore')
-        else:
-            result['href'] = f.url
-        result['status'] = 200
-    if hasattr(f, 'status'):
-        result['status'] = f.status
-    if hasattr(f, 'close'):
-        f.close()
-
-    if data is None:
-        return result
+    data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)

-    # Stop processing if the server sent HTTP 304 Not Modified.
-    if getattr(f, 'code', 0) == 304:
-        result['version'] = ''
-        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
-            'so the server sent no data. This is a feature, not a bug!'
+    if not data:
         return result

-    data, result['encoding'], error = convert_to_utf8(http_headers, data)
+    # overwrite existing headers using response_headers
+    result['headers'].update(response_headers or {})
+
+    data = convert_to_utf8(result['headers'], data, result)
     use_strict_parser = result['encoding'] and True or False
-    if error is not None:
-        result['bozo'] = 1
-        result['bozo_exception'] = error

     result['version'], data, entities = replace_doctype(data)

     # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
-    contentloc = http_headers.get('content-location', '')
+    contentloc = result['headers'].get('content-location', '')
     href = result.get('href', '')
     baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href

-    baselang = http_headers.get('content-language', None)
+    baselang = result['headers'].get('content-language', None)
     if isinstance(baselang, bytes_) and baselang is not None:
         baselang = baselang.decode('utf-8', 'ignore')
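
feedparser/http.py itself is not part of this commit, so the shape of http.get() can only be inferred from its call site in _open_resource() and from what parse() stopped doing: it must fetch the URL, return the body as bytes (empty on 304 Not Modified, since parse() now bails out on falsy data), and record response metadata in the shared result dict. A minimal sketch of that presumed contract, using the stdlib urllib; every detail here is an assumption, not the migrated code:

    import urllib.request
    from urllib.error import HTTPError


    def get(url, etag, modified, agent, referrer, handlers, request_headers, result):
        """Hypothetical stand-in for feedparser.http.get()."""
        request = urllib.request.Request(url, headers=dict(request_headers or {}))
        request.add_header('User-Agent', agent)
        if etag:
            request.add_header('If-None-Match', etag)
        if modified:
            # Assumes `modified` is already an HTTP-date string.
            request.add_header('If-Modified-Since', modified)
        if referrer:
            request.add_header('Referer', referrer)
        opener = urllib.request.build_opener(*(handlers or []))
        try:
            with opener.open(request) as response:
                data = response.read()
                result['headers'] = dict(response.headers)
                result['href'] = response.geturl()
                result['status'] = response.getcode()
        except HTTPError as error:
            if error.code != 304:
                raise
            # 304 Not Modified: empty data makes parse() return early.
            result['status'] = 304
            data = b''
        return data

The ACCEPT_HEADER constant, the gzip/zlib decompression, and the basic-auth/IRI handling deleted above presumably moved behind this same interface.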

feedparser/encodings.py

Lines changed: 6 additions & 2 deletions
@@ -72,7 +72,7 @@ def lazy_chardet_encoding():
 # Example: <?xml version="1.0" encoding="utf-8"?>
 RE_XML_PI_ENCODING = re.compile(b'^<\?.*encoding=[\'"](.*?)[\'"].*\?>')

-def convert_to_utf8(http_headers, data):
+def convert_to_utf8(http_headers, data, result):
     '''Detect and convert the character encoding to UTF-8.

     http_headers is a dictionary
@@ -272,4 +272,8 @@ def convert_to_utf8(http_headers, data):
            (rfc3023_encoding, proposed_encoding))
         rfc3023_encoding = proposed_encoding

-    return data, rfc3023_encoding, error
+    result['encoding'] = rfc3023_encoding
+    if error:
+        result['bozo'] = True
+        result['bozo_exception'] = error
+    return data
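
The return-value change mirrors the one in parse() above: encoding and error state now travel through the shared result dict instead of a 3-tuple. Side by side, as seen at the call site in feedparser/__init__.py (both lines are taken from this diff):

    # Old: three return values; the caller copied error state into `result`.
    data, result['encoding'], error = convert_to_utf8(http_headers, data)

    # New: one return value; convert_to_utf8() now sets result['encoding']
    # itself, plus result['bozo'] / result['bozo_exception'] on an error.
    data = convert_to_utf8(result['headers'], data, result)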
