Skip to content

Commit aa25803

Browse files
committed
Move character encoding and exceptions code to separate files
Additionally, remove iconv_codec as a recommended module
1 parent 8e57ebb commit aa25803

4 files changed

Lines changed: 330 additions & 267 deletions

File tree

NEWS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
coming in the next release:
22
* Convert feedparser from a monolithic file to a package
33
* Unify the codebase so that 2to3 conversion is no longer required
4+
* Remove references to iconv_codec
45
* Update the Creative Commons namespace URIs
56
* Update the default User-Agent name and URL
67
* Catch EOFError when using gzip in Python 3.4

feedparser/__init__.py

Lines changed: 2 additions & 267 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from __future__ import absolute_import, unicode_literals
2-
import collections
32

43
"""Universal feed parser
54
@@ -9,7 +8,6 @@
98
Visit http://packages.python.org/feedparser/ for the latest documentation
109
1110
Required: Python 2.6 or later
12-
Recommended: iconv_codec <http://cjkpython.i18n.org/>
1311
"""
1412

1513
__version__ = "5.2.0"
@@ -74,14 +72,6 @@
7472
SANITIZE_HTML = 1
7573

7674
# ---------- Python 3 modules (make it work if possible) ----------
77-
try:
78-
# Python 3.1 introduces bytes.maketrans and simultaneously
79-
# deprecates string.maketrans; use bytes.maketrans if possible
80-
_maketrans = bytes.maketrans
81-
except (NameError, AttributeError):
82-
import string
83-
_maketrans = string.maketrans
84-
8575
# base64 support for Atom feeds that contain embedded binary data
8676
try:
8777
import base64, binascii
@@ -92,15 +82,9 @@
9282
_base64decode = getattr(base64, 'decodebytes', base64.decodestring)
9383

9484
# ---------- required modules (should come with any Python distribution) ----------
95-
import cgi
96-
import codecs
9785
import copy
98-
import datetime
99-
import itertools
10086
import re
10187
import struct
102-
import time
103-
import types
10488

10589
try:
10690
from html.entities import name2codepoint, entitydefs
@@ -168,21 +152,9 @@ def _xmlescape(data,entities={}):
168152
else:
169153
_XML_AVAILABLE = 1
170154

171-
# iconv_codec provides support for more character encodings.
172-
# It's available from http://cjkpython.i18n.org/
173-
try:
174-
import iconv_codec
175-
except ImportError:
176-
pass
177-
178-
# chardet library auto-detects character encodings
179-
# Download from http://chardet.feedparser.org/
180-
try:
181-
import chardet
182-
except ImportError:
183-
chardet = None
184-
185155
from .datetimes import registerDateHandler, _parse_date
156+
from .encodings import convert_to_utf8
157+
from .exceptions import *
186158
from .html import _BaseHTMLProcessor, _cp1252
187159
from .http import _build_urllib2_request, _FeedURLHandler
188160
from .namespaces import cc, dc, georss, itunes, mediarss, psc
@@ -191,13 +163,6 @@ def _xmlescape(data,entities={}):
191163
from .urls import _urljoin, _convert_to_idn, _makeSafeAbsoluteURI, _resolveRelativeURIs
192164
from .util import FeedParserDict
193165

194-
# ---------- don't touch these ----------
195-
class ThingsNobodyCaresAboutButMe(Exception): pass
196-
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
197-
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
198-
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
199-
class UndeclaredNamespace(Exception): pass
200-
201166
SUPPORTED_VERSIONS = {'': 'unknown',
202167
'rss090': 'RSS 0.90',
203168
'rss091n': 'RSS 0.91 (Netscape)',
@@ -1591,235 +1556,6 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
15911556
return _StringIO(url_file_stream_or_string.encode('utf-8'))
15921557
return _StringIO(url_file_stream_or_string)
15931558

1594-
# Each marker represents some of the characters of the opening XML
1595-
# processing instruction ('<?xm') in the specified encoding.
1596-
EBCDIC_MARKER = b'\x4C\x6F\xA7\x94'
1597-
UTF16BE_MARKER = b'\x00\x3C\x00\x3F'
1598-
UTF16LE_MARKER = b'\x3C\x00\x3F\x00'
1599-
UTF32BE_MARKER = b'\x00\x00\x00\x3C'
1600-
UTF32LE_MARKER = b'\x3C\x00\x00\x00'
1601-
1602-
ZERO_BYTES = '\x00\x00'
1603-
1604-
# Match the opening XML declaration.
1605-
# Example: <?xml version="1.0" encoding="utf-8"?>
1606-
RE_XML_DECLARATION = re.compile('^<\?xml[^>]*?>')
1607-
1608-
# Capture the value of the XML processing instruction's encoding attribute.
1609-
# Example: <?xml version="1.0" encoding="utf-8"?>
1610-
RE_XML_PI_ENCODING = re.compile(b'^<\?.*encoding=[\'"](.*?)[\'"].*\?>')
1611-
1612-
def convert_to_utf8(http_headers, data):
1613-
'''Detect and convert the character encoding to UTF-8.
1614-
1615-
http_headers is a dictionary
1616-
data is a raw string (not Unicode)'''
1617-
1618-
# This is so much trickier than it sounds, it's not even funny.
1619-
# According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
1620-
# is application/xml, application/*+xml,
1621-
# application/xml-external-parsed-entity, or application/xml-dtd,
1622-
# the encoding given in the charset parameter of the HTTP Content-Type
1623-
# takes precedence over the encoding given in the XML prefix within the
1624-
# document, and defaults to 'utf-8' if neither is specified. But, if
1625-
# the HTTP Content-Type is text/xml, text/*+xml, or
1626-
# text/xml-external-parsed-entity, the encoding given in the XML prefix
1627-
# within the document is ALWAYS IGNORED and only the encoding given in
1628-
# the charset parameter of the HTTP Content-Type header should be
1629-
# respected, and it defaults to 'us-ascii' if not specified.
1630-
1631-
# Furthermore, discussion on the atom-syntax mailing list with the
1632-
# author of RFC 3023 leads me to the conclusion that any document
1633-
# served with a Content-Type of text/* and no charset parameter
1634-
# must be treated as us-ascii. (We now do this.) And also that it
1635-
# must always be flagged as non-well-formed. (We now do this too.)
1636-
1637-
# If Content-Type is unspecified (input was local file or non-HTTP source)
1638-
# or unrecognized (server just got it totally wrong), then go by the
1639-
# encoding given in the XML prefix of the document and default to
1640-
# 'iso-8859-1' as per the HTTP specification (RFC 2616).
1641-
1642-
# Then, assuming we didn't find a character encoding in the HTTP headers
1643-
# (and the HTTP Content-type allowed us to look in the body), we need
1644-
# to sniff the first few bytes of the XML data and try to determine
1645-
# whether the encoding is ASCII-compatible. Section F of the XML
1646-
# specification shows the way here:
1647-
# http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
1648-
1649-
# If the sniffed encoding is not ASCII-compatible, we need to make it
1650-
# ASCII compatible so that we can sniff further into the XML declaration
1651-
# to find the encoding attribute, which will tell us the true encoding.
1652-
1653-
# Of course, none of this guarantees that we will be able to parse the
1654-
# feed in the declared character encoding (assuming it was declared
1655-
# correctly, which many are not). iconv_codec can help a lot;
1656-
# you should definitely install it if you can.
1657-
# http://cjkpython.i18n.org/
1658-
1659-
bom_encoding = ''
1660-
xml_encoding = ''
1661-
rfc3023_encoding = ''
1662-
1663-
# Look at the first few bytes of the document to guess what
1664-
# its encoding may be. We only need to decode enough of the
1665-
# document that we can use an ASCII-compatible regular
1666-
# expression to search for an XML encoding declaration.
1667-
# The heuristic follows the XML specification, section F:
1668-
# http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
1669-
# Check for BOMs first.
1670-
if data[:4] == codecs.BOM_UTF32_BE:
1671-
bom_encoding = 'utf-32be'
1672-
data = data[4:]
1673-
elif data[:4] == codecs.BOM_UTF32_LE:
1674-
bom_encoding = 'utf-32le'
1675-
data = data[4:]
1676-
elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
1677-
bom_encoding = 'utf-16be'
1678-
data = data[2:]
1679-
elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
1680-
bom_encoding = 'utf-16le'
1681-
data = data[2:]
1682-
elif data[:3] == codecs.BOM_UTF8:
1683-
bom_encoding = 'utf-8'
1684-
data = data[3:]
1685-
# Check for the characters '<?xm' in several encodings.
1686-
elif data[:4] == EBCDIC_MARKER:
1687-
bom_encoding = 'cp037'
1688-
elif data[:4] == UTF16BE_MARKER:
1689-
bom_encoding = 'utf-16be'
1690-
elif data[:4] == UTF16LE_MARKER:
1691-
bom_encoding = 'utf-16le'
1692-
elif data[:4] == UTF32BE_MARKER:
1693-
bom_encoding = 'utf-32be'
1694-
elif data[:4] == UTF32LE_MARKER:
1695-
bom_encoding = 'utf-32le'
1696-
1697-
tempdata = data
1698-
try:
1699-
if bom_encoding:
1700-
tempdata = data.decode(bom_encoding).encode('utf-8')
1701-
except (UnicodeDecodeError, LookupError):
1702-
# feedparser recognizes UTF-32 encodings that aren't
1703-
# available in Python 2.4 and 2.5, so it's possible to
1704-
# encounter a LookupError during decoding.
1705-
xml_encoding_match = None
1706-
else:
1707-
xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
1708-
1709-
if xml_encoding_match:
1710-
xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
1711-
# Normalize the xml_encoding if necessary.
1712-
if bom_encoding and (xml_encoding in (
1713-
'u16', 'utf-16', 'utf16', 'utf_16',
1714-
'u32', 'utf-32', 'utf32', 'utf_32',
1715-
'iso-10646-ucs-2', 'iso-10646-ucs-4',
1716-
'csucs4', 'csunicode', 'ucs-2', 'ucs-4'
1717-
)):
1718-
xml_encoding = bom_encoding
1719-
1720-
# Find the HTTP Content-Type and, hopefully, a character
1721-
# encoding provided by the server. The Content-Type is used
1722-
# to choose the "correct" encoding among the BOM encoding,
1723-
# XML declaration encoding, and HTTP encoding, following the
1724-
# heuristic defined in RFC 3023.
1725-
http_content_type = http_headers.get('content-type') or ''
1726-
http_content_type, params = cgi.parse_header(http_content_type)
1727-
http_encoding = params.get('charset', '').replace("'", "")
1728-
if isinstance(http_encoding, bytes_):
1729-
http_encoding = http_encoding.decode('utf-8', 'ignore')
1730-
1731-
acceptable_content_type = 0
1732-
application_content_types = ('application/xml', 'application/xml-dtd',
1733-
'application/xml-external-parsed-entity')
1734-
text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
1735-
if (http_content_type in application_content_types) or \
1736-
(http_content_type.startswith('application/') and
1737-
http_content_type.endswith('+xml')):
1738-
acceptable_content_type = 1
1739-
rfc3023_encoding = http_encoding or xml_encoding or 'utf-8'
1740-
elif (http_content_type in text_content_types) or \
1741-
(http_content_type.startswith('text/') and
1742-
http_content_type.endswith('+xml')):
1743-
acceptable_content_type = 1
1744-
rfc3023_encoding = http_encoding or 'us-ascii'
1745-
elif http_content_type.startswith('text/'):
1746-
rfc3023_encoding = http_encoding or 'us-ascii'
1747-
elif http_headers and 'content-type' not in http_headers:
1748-
rfc3023_encoding = xml_encoding or 'iso-8859-1'
1749-
else:
1750-
rfc3023_encoding = xml_encoding or 'utf-8'
1751-
# gb18030 is a superset of gb2312, so always replace gb2312
1752-
# with gb18030 for greater compatibility.
1753-
if rfc3023_encoding.lower() == 'gb2312':
1754-
rfc3023_encoding = 'gb18030'
1755-
if xml_encoding.lower() == 'gb2312':
1756-
xml_encoding = 'gb18030'
1757-
1758-
# there are four encodings to keep track of:
1759-
# - http_encoding is the encoding declared in the Content-Type HTTP header
1760-
# - xml_encoding is the encoding declared in the <?xml declaration
1761-
# - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
1762-
# - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
1763-
error = None
1764-
1765-
if http_headers and (not acceptable_content_type):
1766-
if 'content-type' in http_headers:
1767-
msg = '%s is not an XML media type' % http_headers['content-type']
1768-
else:
1769-
msg = 'no Content-type specified'
1770-
error = NonXMLContentType(msg)
1771-
1772-
# determine character encoding
1773-
known_encoding = 0
1774-
lazy_chardet_encoding = None
1775-
tried_encodings = []
1776-
if chardet:
1777-
def lazy_chardet_encoding():
1778-
chardet_encoding = chardet.detect(data)['encoding']
1779-
if not chardet_encoding:
1780-
chardet_encoding = ''
1781-
if isinstance(chardet_encoding, bytes_):
1782-
chardet_encoding = chardet_encoding.encode('ascii', 'ignore')
1783-
return chardet_encoding
1784-
# try, in order: RFC 3023 (HTTP) encoding, declared XML encoding, encoding sniffed from BOM, chardet's guess, then utf-8, windows-1252, and iso-8859-2
1785-
for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding,
1786-
lazy_chardet_encoding, 'utf-8', 'windows-1252', 'iso-8859-2'):
1787-
if isinstance(proposed_encoding, collections.Callable):
1788-
proposed_encoding = proposed_encoding()
1789-
if not proposed_encoding:
1790-
continue
1791-
if proposed_encoding in tried_encodings:
1792-
continue
1793-
tried_encodings.append(proposed_encoding)
1794-
try:
1795-
data = data.decode(proposed_encoding)
1796-
except (UnicodeDecodeError, LookupError):
1797-
pass
1798-
else:
1799-
known_encoding = 1
1800-
# Update the encoding in the opening XML processing instruction.
1801-
new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
1802-
if RE_XML_DECLARATION.search(data):
1803-
data = RE_XML_DECLARATION.sub(new_declaration, data)
1804-
else:
1805-
data = new_declaration + '\n' + data
1806-
data = data.encode('utf-8')
1807-
break
1808-
# if still no luck, give up
1809-
if not known_encoding:
1810-
error = CharacterEncodingUnknown(
1811-
'document encoding unknown, I tried ' +
1812-
'%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
1813-
(rfc3023_encoding, xml_encoding))
1814-
rfc3023_encoding = ''
1815-
elif proposed_encoding != rfc3023_encoding:
1816-
error = CharacterEncodingOverride(
1817-
'document declared as %s, but parsed as %s' %
1818-
(rfc3023_encoding, proposed_encoding))
1819-
rfc3023_encoding = proposed_encoding
1820-
1821-
return data, rfc3023_encoding, error
1822-
18231559
# Match XML entity declarations.
18241560
# Example: <!ENTITY copyright "(C)">
18251561
RE_ENTITY_PATTERN = re.compile(br'^\s*<!ENTITY([^>]*?)>', re.MULTILINE)
@@ -1878,7 +1614,6 @@ def replace_doctype(data):
18781614
return version, data, safe_entities
18791615

18801616

1881-
18821617
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
18831618
'''Parse a feed from a URL, file, stream, or string.
18841619

0 commit comments

Comments
 (0)