@@ -54,10 +54,6 @@
 # change this to your application name and URL.
 USER_AGENT = "feedparser/%s +https://github.com/kurtmckee/feedparser/" % __version__
 
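# The comment above invites an override; a minimal sketch (the application
# name and URL below are placeholders, not a real project):
#
#     import feedparser
#     feedparser.USER_AGENT = "MyFeedReader/1.0 +https://example.com/myfeedreader/"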
-# HTTP "Accept" header to send to servers when downloading feeds. If you don't
-# want to send an Accept header, set this to None.
-ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
-
 # List of preferred XML parsers, by SAX driver name. These will be tried first,
 # but if they're not installed, Python will keep searching through its own list
 # of pre-installed parsers until it finds one that supports everything we need.
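# A minimal sketch of steering parser selection, assuming libxml2's Python
# bindings are installed ("drv_libxml2" is their registered SAX driver name):
#
#     import feedparser
#     feedparser.PREFERRED_XML_PARSERS = ["drv_libxml2"]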
@@ -84,7 +80,6 @@
 # ---------- required modules (should come with any Python distribution) ----------
 import copy
 import re
-import struct
 
 try:
     from html.entities import name2codepoint, entitydefs
@@ -101,33 +96,15 @@
 
 try:
     import urllib.parse
-    import urllib.request
 except ImportError:
-    from urllib import splithost, splittype, splituser
-    from urllib2 import build_opener
     from urlparse import urlparse
 
     class urllib(object):
         class parse(object):
-            splithost = staticmethod(splithost)
-            splittype = staticmethod(splittype)
-            splituser = staticmethod(splituser)
             urlparse = staticmethod(urlparse)
-        class request(object):
-            build_opener = staticmethod(build_opener)
 
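# Note: under Python 2, the stub class above mimics the Python 3 module
# layout, so a call such as urllib.parse.urlparse('http://example.com/feed')
# resolves the same way on both interpreters (the URL is illustrative).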
 # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
 
-# gzip is included with most Python distributions, but may not be available if you compiled your own
-try:
-    import gzip
-except ImportError:
-    gzip = None
-try:
-    import zlib
-except ImportError:
-    zlib = None
-
 # If a real XML parser is available, feedparser will attempt to use it. feedparser has
 # been tested with the built-in SAX parser and libxml2. On platforms where the
 # Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
@@ -156,7 +133,7 @@ def _xmlescape(data,entities={}):
 from .encodings import convert_to_utf8
 from .exceptions import *
 from .html import _BaseHTMLProcessor, _cp1252
-from .http import _build_urllib2_request, _FeedURLHandler
+from . import http
 from .namespaces import _base, cc, dc, georss, itunes, mediarss, psc
 from .sanitizer import _sanitizeHTML, _HTMLSanitizer
 from .sgml import *
@@ -1029,7 +1006,7 @@ def decodeEntities(self, element, data):
     def strattrs(self, attrs):
         return ''.join([' %s="%s"' % (n,v.replace('"','&quot;')) for n,v in attrs])
 
-def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
+def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result):
     """URL, filename, or string --> stream
 
     This function lets you define parsers that take any input source
@@ -1064,44 +1041,16 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
     """
 
     if hasattr(url_file_stream_or_string, 'read'):
-        return url_file_stream_or_string
+        return url_file_stream_or_string.read()
 
     if isinstance(url_file_stream_or_string, basestring) \
        and urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
-        # Deal with the feed URI scheme
-        if url_file_stream_or_string.startswith('feed:http'):
-            url_file_stream_or_string = url_file_stream_or_string[5:]
-        elif url_file_stream_or_string.startswith('feed:'):
-            url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
-        if not agent:
-            agent = USER_AGENT
-        # Test for inline user:password credentials for HTTP basic auth
-        auth = None
-        if base64 and not url_file_stream_or_string.startswith('ftp:'):
-            urltype, rest = urllib.parse.splittype(url_file_stream_or_string)
-            realhost, rest = urllib.parse.splithost(rest)
-            if realhost:
-                user_passwd, realhost = urllib.parse.splituser(realhost)
-                if user_passwd:
-                    url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
-                    auth = base64.standard_b64encode(user_passwd).strip()
-
-        # iri support
-        if not isinstance(url_file_stream_or_string, bytes_):
-            url_file_stream_or_string = _convert_to_idn(url_file_stream_or_string)
-
-        # try to open with urllib2 (to use optional headers)
-        request = _build_urllib2_request(url_file_stream_or_string, agent, ACCEPT_HEADER, etag, modified, referrer, auth, request_headers)
-        opener = urllib.request.build_opener(*tuple(handlers + [_FeedURLHandler()]))
-        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
-        try:
-            return opener.open(request)
-        finally:
-            opener.close() # JohnD
+        return http.get(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
 
     # try to open with native open function (if url_file_stream_or_string is a filename)
     try:
-        return open(url_file_stream_or_string, 'rb')
+        with open(url_file_stream_or_string, 'rb') as f:
+            data = f.read()
     except (IOError, UnicodeEncodeError, TypeError):
         # if url_file_stream_or_string is a unicode object that
         # cannot be converted to the encoding returned by
@@ -1111,11 +1060,13 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
         # (such as an XML document encoded in UTF-32), TypeError will
         # be thrown.
         pass
+    else:
+        return data
 
     # treat url_file_stream_or_string as string
     if not isinstance(url_file_stream_or_string, bytes_):
-        return _StringIO(url_file_stream_or_string.encode('utf-8'))
-    return _StringIO(url_file_stream_or_string)
+        return url_file_stream_or_string.encode('utf-8')
+    return url_file_stream_or_string
 
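# A rough sketch of _open_resource()'s contract, assuming http.get()
# likewise returns the response body as bytes and records transport
# metadata (headers, status, href) on `result` (URL and arguments are
# illustrative):
#
#     result = FeedParserDict()
#     data = _open_resource('http://example.com/feed.xml', None, None,
#                           USER_AGENT, None, [], {}, result)
#     assert isinstance(data, bytes)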
 # Match XML entity declarations.
 # Example: <!ENTITY copyright "(C)">
@@ -1175,7 +1126,7 @@ def replace_doctype(data):
     return version, data, safe_entities
 
 
-def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
+def parse(url_file_stream_or_string, etag=None, modified=None, agent=USER_AGENT, referrer=None, handlers=None, request_headers=None, response_headers=None):
     '''Parse a feed from a URL, file, stream, or string.
 
     request_headers, if given, is a dict from http header name to value to add
@@ -1184,116 +1135,32 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
     :return: A :class:`FeedParserDict`.
     '''
 
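# A typical conditional-GET pattern with this signature, reusing the etag
# and modified values from an earlier result so an unchanged feed is not
# re-downloaded (the URL is illustrative):
#
#     import feedparser
#     d = feedparser.parse('http://example.com/feed.xml')
#     d2 = feedparser.parse('http://example.com/feed.xml',
#                           etag=d.get('etag'), modified=d.get('modified'))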
-    if handlers is None:
-        handlers = []
-    if request_headers is None:
-        request_headers = {}
-    if response_headers is None:
-        response_headers = {}
-
-    result = FeedParserDict()
-    result['feed'] = FeedParserDict()
-    result['entries'] = []
-    result['bozo'] = 0
-    if not isinstance(handlers, list):
-        handlers = [handlers]
-    try:
-        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
-        data = f.read()
-    except Exception as e:
-        result['bozo'] = 1
-        result['bozo_exception'] = e
-        data = None
-        f = None
-
-    if hasattr(f, 'headers'):
-        result['headers'] = dict(f.headers)
-    # overwrite existing headers using response_headers
-    if 'headers' in result:
-        result['headers'].update(response_headers)
-    elif response_headers:
-        result['headers'] = copy.deepcopy(response_headers)
-
-    # lowercase all of the HTTP headers for comparisons per RFC 2616
-    if 'headers' in result:
-        http_headers = dict((k.lower(), v) for k, v in result['headers'].items())
-    else:
-        http_headers = {}
+    result = FeedParserDict(
+        bozo=False,
+        entries=[],
+        feed=FeedParserDict(),
+        headers={},
+    )
 
-    # if feed is gzip-compressed, decompress it
-    if f and data and http_headers:
-        if gzip and 'gzip' in http_headers.get('content-encoding', ''):
-            try:
-                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
-            except (EOFError, IOError, struct.error) as e:
-                # IOError can occur if the gzip header is bad.
-                # struct.error can occur if the data is damaged.
-                result['bozo'] = 1
-                result['bozo_exception'] = e
-                if isinstance(e, struct.error):
-                    # A gzip header was found but the data is corrupt.
-                    # Ideally, we should re-request the feed without the
-                    # 'Accept-encoding: gzip' header, but we don't.
-                    data = None
-        elif zlib and 'deflate' in http_headers.get('content-encoding', ''):
-            try:
-                data = zlib.decompress(data)
-            except zlib.error as e:
-                try:
-                    # The data may have no headers and no checksum.
-                    data = zlib.decompress(data, -15)
-                except zlib.error as e:
-                    result['bozo'] = 1
-                    result['bozo_exception'] = e
-
-    # save HTTP headers
-    if http_headers:
-        if 'etag' in http_headers:
-            etag = http_headers.get('etag', '')
-            if isinstance(etag, bytes_):
-                etag = etag.decode('utf-8', 'ignore')
-            if etag:
-                result['etag'] = etag
-        if 'last-modified' in http_headers:
-            modified = http_headers.get('last-modified', '')
-            if modified:
-                result['modified'] = modified
-                result['modified_parsed'] = _parse_date(modified)
-    if hasattr(f, 'url'):
-        if isinstance(f.url, bytes_):
-            result['href'] = f.url.decode('utf-8', 'ignore')
-        else:
-            result['href'] = f.url
-        result['status'] = 200
-    if hasattr(f, 'status'):
-        result['status'] = f.status
-    if hasattr(f, 'close'):
-        f.close()
-
-    if data is None:
-        return result
+    data = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)
 
-    # Stop processing if the server sent HTTP 304 Not Modified.
-    if getattr(f, 'code', 0) == 304:
-        result['version'] = ''
-        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
-            'so the server sent no data. This is a feature, not a bug!'
+    if not data:
         return result
 
-    data, result['encoding'], error = convert_to_utf8(http_headers, data)
+    # overwrite existing headers using response_headers
+    result['headers'].update(response_headers or {})
+
+    data = convert_to_utf8(result['headers'], data, result)
     use_strict_parser = result['encoding'] and True or False
-    if error is not None:
-        result['bozo'] = 1
-        result['bozo_exception'] = error
 
     result['version'], data, entities = replace_doctype(data)
 
     # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
-    contentloc = http_headers.get('content-location', '')
+    contentloc = result['headers'].get('content-location', '')
     href = result.get('href', '')
     baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href
 
-    baselang = http_headers.get('content-language', None)
+    baselang = result['headers'].get('content-language', None)
     if isinstance(baselang, bytes_) and baselang is not None:
         baselang = baselang.decode('utf-8', 'ignore')
 