11from __future__ import absolute_import , unicode_literals
2- import collections
32
43"""Universal feed parser
54
98Visit http://packages.python.org/feedparser/ for the latest documentation
109
1110Required: Python 2.6 or later
12- Recommended: iconv_codec <http://cjkpython.i18n.org/>
1311"""
1412
1513__version__ = "5.2.0"
7472SANITIZE_HTML = 1
7573
7674# ---------- Python 3 modules (make it work if possible) ----------
77- try :
78- # Python 3.1 introduces bytes.maketrans and simultaneously
79- # deprecates string.maketrans; use bytes.maketrans if possible
80- _maketrans = bytes .maketrans
81- except (NameError , AttributeError ):
82- import string
83- _maketrans = string .maketrans
84-
8575# base64 support for Atom feeds that contain embedded binary data
8676try :
8777 import base64 , binascii
9282 _base64decode = getattr (base64 , 'decodebytes' , base64 .decodestring )
9383
9484# ---------- required modules (should come with any Python distribution) ----------
95- import cgi
96- import codecs
9785import copy
98- import datetime
99- import itertools
10086import re
10187import struct
102- import time
103- import types
10488
10589try :
10690 from html .entities import name2codepoint , entitydefs
@@ -168,21 +152,9 @@ def _xmlescape(data,entities={}):
168152 else :
169153 _XML_AVAILABLE = 1
170154
171- # iconv_codec provides support for more character encodings.
172- # It's available from http://cjkpython.i18n.org/
173- try :
174- import iconv_codec
175- except ImportError :
176- pass
177-
178- # chardet library auto-detects character encodings
179- # Download from http://chardet.feedparser.org/
180- try :
181- import chardet
182- except ImportError :
183- chardet = None
184-
185155from .datetimes import registerDateHandler , _parse_date
156+ from .encodings import convert_to_utf8
157+ from .exceptions import *
186158from .html import _BaseHTMLProcessor , _cp1252
187159from .http import _build_urllib2_request , _FeedURLHandler
188160from .namespaces import cc , dc , georss , itunes , mediarss , psc
@@ -191,13 +163,6 @@ def _xmlescape(data,entities={}):
191163from .urls import _urljoin , _convert_to_idn , _makeSafeAbsoluteURI , _resolveRelativeURIs
192164from .util import FeedParserDict
193165
# ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception):
    """Common base class for the three encoding/content-type exceptions below."""


class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe):
    """The document was parsed with a different encoding than the one declared."""


class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe):
    """None of the candidate encodings could decode the document."""


class NonXMLContentType(ThingsNobodyCaresAboutButMe):
    """The HTTP Content-Type is missing or is not an XML media type."""


class UndeclaredNamespace(Exception):
    """Namespace-related error; derives directly from Exception."""
200-
201166SUPPORTED_VERSIONS = {'' : 'unknown' ,
202167 'rss090' : 'RSS 0.90' ,
203168 'rss091n' : 'RSS 0.91 (Netscape)' ,
@@ -1591,235 +1556,6 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
15911556 return _StringIO (url_file_stream_or_string .encode ('utf-8' ))
15921557 return _StringIO (url_file_stream_or_string )
15931558
# Each marker represents some of the characters of the opening XML
# processing instruction ('<?xm') in the specified encoding.
# Every marker is exactly four bytes long because it is compared
# against the first four bytes of the document.
EBCDIC_MARKER = b'\x4C\x6F\xA7\x94'
UTF16BE_MARKER = b'\x00\x3C\x00\x3F'
UTF16LE_MARKER = b'\x3C\x00\x3F\x00'
UTF32BE_MARKER = b'\x00\x00\x00\x3C'
UTF32LE_MARKER = b'\x3C\x00\x00\x00'

# Two NUL bytes. This must be a byte string: it is compared against
# slices of the raw (undecoded) document.
ZERO_BYTES = b'\x00\x00'

# Match the opening XML declaration (applied to decoded text).
# Example: <?xml version="1.0" encoding="utf-8"?>
RE_XML_DECLARATION = re.compile(r'^<\?xml[^>]*?>')

# Capture the value of the XML processing instruction's encoding attribute
# (applied to bytes).
# Example: <?xml version="1.0" encoding="utf-8"?>
RE_XML_PI_ENCODING = re.compile(br'''^<\?.*encoding=['"](.*?)['"].*\?>''')
1611-
def convert_to_utf8(http_headers, data):
    '''Detect and convert the character encoding to UTF-8.

    http_headers is a dictionary
    data is a raw string (not Unicode)

    Returns a (data, encoding, error) tuple:

    data is the document re-encoded as UTF-8 with its XML declaration
    rewritten to say so, or the input (minus any BOM) if no candidate
    encoding could decode it.
    encoding is the name of the encoding that decoded the document, or
    '' if none did.
    error is None or an instance of NonXMLContentType,
    CharacterEncodingUnknown, or CharacterEncodingOverride. The error
    is returned, never raised.
    '''

    # This is so much trickier than it sounds, it's not even funny.
    # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    # is application/xml, application/*+xml,
    # application/xml-external-parsed-entity, or application/xml-dtd,
    # the encoding given in the charset parameter of the HTTP Content-Type
    # takes precedence over the encoding given in the XML prefix within the
    # document, and defaults to 'utf-8' if neither are specified.  But, if
    # the HTTP Content-Type is text/xml, text/*+xml, or
    # text/xml-external-parsed-entity, the encoding given in the XML prefix
    # within the document is ALWAYS IGNORED and only the encoding given in
    # the charset parameter of the HTTP Content-Type header should be
    # respected, and it defaults to 'us-ascii' if not specified.

    # Furthermore, discussion on the atom-syntax mailing list with the
    # author of RFC 3023 leads me to the conclusion that any document
    # served with a Content-Type of text/* and no charset parameter
    # must be treated as us-ascii.  (We now do this.)  And also that it
    # must always be flagged as non-well-formed.  (We now do this too.)

    # If Content-Type is unspecified (input was local file or non-HTTP source)
    # or unrecognized (server just got it totally wrong), then go by the
    # encoding given in the XML prefix of the document and default to
    # 'iso-8859-1' as per the HTTP specification (RFC 2616).

    # Then, assuming we didn't find a character encoding in the HTTP headers
    # (and the HTTP Content-type allowed us to look in the body), we need
    # to sniff the first few bytes of the XML data and try to determine
    # whether the encoding is ASCII-compatible.  Section F of the XML
    # specification shows the way here:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    # If the sniffed encoding is not ASCII-compatible, we need to make it
    # ASCII compatible so that we can sniff further into the XML declaration
    # to find the encoding attribute, which will tell us the true encoding.

    # Of course, none of this guarantees that we will be able to parse the
    # feed in the declared character encoding (assuming it was declared
    # correctly, which many are not).  iconv_codec can help a lot;
    # you should definitely install it if you can.
    # http://cjkpython.i18n.org/

    bom_encoding = ''
    xml_encoding = ''
    rfc3023_encoding = ''

    # Look at the first few bytes of the document to guess what
    # its encoding may be. We only need to decode enough of the
    # document that we can use an ASCII-compatible regular
    # expression to search for an XML encoding declaration.
    # The heuristic follows the XML specification, section F:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    # Check for BOMs first. The UTF-32 BOMs must be tested before the
    # UTF-16 ones because the UTF-32LE BOM starts with the UTF-16LE BOM.
    if data[:4] == codecs.BOM_UTF32_BE:
        bom_encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == codecs.BOM_UTF32_LE:
        bom_encoding = 'utf-32le'
        data = data[4:]
    elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
        bom_encoding = 'utf-16be'
        data = data[2:]
    elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
        bom_encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == codecs.BOM_UTF8:
        bom_encoding = 'utf-8'
        data = data[3:]
    # Check for the characters '<?xm' in several encodings.
    elif data[:4] == EBCDIC_MARKER:
        bom_encoding = 'cp037'
    elif data[:4] == UTF16BE_MARKER:
        bom_encoding = 'utf-16be'
    elif data[:4] == UTF16LE_MARKER:
        bom_encoding = 'utf-16le'
    elif data[:4] == UTF32BE_MARKER:
        bom_encoding = 'utf-32be'
    elif data[:4] == UTF32LE_MARKER:
        bom_encoding = 'utf-32le'

    # Re-encode a sniffed non-ASCII-compatible document as UTF-8 so the
    # byte-oriented regular expression can read its XML declaration.
    tempdata = data
    try:
        if bom_encoding:
            tempdata = data.decode(bom_encoding).encode('utf-8')
    except (UnicodeDecodeError, LookupError):
        # feedparser recognizes UTF-32 encodings that aren't
        # available in Python 2.4 and 2.5, so it's possible to
        # encounter a LookupError during decoding.
        xml_encoding_match = None
    else:
        xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)

    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
        # Normalize the xml_encoding if necessary: a generic UTF-16/UTF-32
        # label says nothing about byte order, but the sniffed one does.
        if bom_encoding and (xml_encoding in (
            'u16', 'utf-16', 'utf16', 'utf_16',
            'u32', 'utf-32', 'utf32', 'utf_32',
            'iso-10646-ucs-2', 'iso-10646-ucs-4',
            'csucs4', 'csunicode', 'ucs-2', 'ucs-4'
        )):
            xml_encoding = bom_encoding

    # Find the HTTP Content-Type and, hopefully, a character
    # encoding provided by the server. The Content-Type is used
    # to choose the "correct" encoding among the BOM encoding,
    # XML declaration encoding, and HTTP encoding, following the
    # heuristic defined in RFC 3023.
    # NOTE(review): the cgi module is deprecated in recent Python
    # releases -- confirm it is available on the targeted versions.
    http_content_type = http_headers.get('content-type') or ''
    http_content_type, params = cgi.parse_header(http_content_type)
    http_encoding = params.get('charset', '').replace("'", "")
    if isinstance(http_encoding, bytes_):
        http_encoding = http_encoding.decode('utf-8', 'ignore')

    acceptable_content_type = False
    application_content_types = ('application/xml', 'application/xml-dtd',
                                 'application/xml-external-parsed-entity')
    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or \
       (http_content_type.startswith('application/') and
            http_content_type.endswith('+xml')):
        acceptable_content_type = True
        rfc3023_encoding = http_encoding or xml_encoding or 'utf-8'
    elif (http_content_type in text_content_types) or \
         (http_content_type.startswith('text/') and
            http_content_type.endswith('+xml')):
        acceptable_content_type = True
        rfc3023_encoding = http_encoding or 'us-ascii'
    elif http_content_type.startswith('text/'):
        rfc3023_encoding = http_encoding or 'us-ascii'
    elif http_headers and 'content-type' not in http_headers:
        rfc3023_encoding = xml_encoding or 'iso-8859-1'
    else:
        rfc3023_encoding = xml_encoding or 'utf-8'
    # gb18030 is a superset of gb2312, so always replace gb2312
    # with gb18030 for greater compatibility.
    if rfc3023_encoding.lower() == 'gb2312':
        rfc3023_encoding = 'gb18030'
    if xml_encoding.lower() == 'gb2312':
        xml_encoding = 'gb18030'

    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
    error = None

    if http_headers and (not acceptable_content_type):
        if 'content-type' in http_headers:
            msg = '%s is not an XML media type' % http_headers['content-type']
        else:
            msg = 'no Content-type specified'
        error = NonXMLContentType(msg)

    # determine character encoding
    known_encoding = False
    lazy_chardet_encoding = None
    tried_encodings = []
    if chardet:
        # Wrapped in a function so detection only runs if every
        # declared or sniffed encoding ahead of it has already failed.
        def lazy_chardet_encoding():
            chardet_encoding = chardet.detect(data)['encoding']
            if not chardet_encoding:
                chardet_encoding = ''
            if isinstance(chardet_encoding, bytes_):
                # Bytes must be decoded (not encoded) to obtain a text
                # encoding name.
                chardet_encoding = chardet_encoding.decode('ascii', 'ignore')
            return chardet_encoding
    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
    for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding,
                              lazy_chardet_encoding, 'utf-8', 'windows-1252', 'iso-8859-2'):
        if callable(proposed_encoding):
            proposed_encoding = proposed_encoding()
        if not proposed_encoding:
            continue
        if proposed_encoding in tried_encodings:
            continue
        tried_encodings.append(proposed_encoding)
        try:
            data = data.decode(proposed_encoding)
        except (UnicodeDecodeError, LookupError):
            pass
        else:
            known_encoding = True
            # Update the encoding in the opening XML processing instruction.
            new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
            if RE_XML_DECLARATION.search(data):
                data = RE_XML_DECLARATION.sub(new_declaration, data)
            else:
                data = new_declaration + '\n' + data
            data = data.encode('utf-8')
            break
    # if still no luck, give up
    if not known_encoding:
        error = CharacterEncodingUnknown(
            'document encoding unknown, I tried ' +
            '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
            (rfc3023_encoding, xml_encoding))
        rfc3023_encoding = ''
    elif proposed_encoding != rfc3023_encoding:
        error = CharacterEncodingOverride(
            'document declared as %s, but parsed as %s' %
            (rfc3023_encoding, proposed_encoding))
        rfc3023_encoding = proposed_encoding

    return data, rfc3023_encoding, error
1822-
18231559# Match XML entity declarations.
18241560# Example: <!ENTITY copyright "(C)">
18251561RE_ENTITY_PATTERN = re .compile (br'^\s*<!ENTITY([^>]*?)>' , re .MULTILINE )
@@ -1878,7 +1614,6 @@ def replace_doctype(data):
18781614 return version , data , safe_entities
18791615
18801616
1881-
18821617def parse (url_file_stream_or_string , etag = None , modified = None , agent = None , referrer = None , handlers = None , request_headers = None , response_headers = None ):
18831618 '''Parse a feed from a URL, file, stream, or string.
18841619
0 commit comments