olivia-fl
diff --git a/‎NEWS‎
Lines changed: 1 addition & 0 deletions b/‎NEWS‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎feedparser/__init__.py‎
Lines changed: 2 additions & 267 deletions b/‎feedparser/__init__.py‎
Lines changed: 2 additions & 267 deletions
@@ -1,6 +1,7 @@
 coming in the next release:
     * Convert feedparser from a monolithic file to a package
     * Unify the codebase so that 2to3 conversion is no longer required
+    * Remove references to iconv_codecs
     * Update the Creative Commons namespace URI's
     * Update the default User-Agent name and URL
     * Catch EOFError when using gzip in Python 3.4
 
@@ -1,5 +1,4 @@
 from __future__ import absolute_import, unicode_literals
-import collections
 
 """Universal feed parser
 
@@ -9,7 +8,6 @@
 Visit http://packages.python.org/feedparser/ for the latest documentation
 
 Required: Python 2.6 or later
-Recommended: iconv_codec <http://cjkpython.i18n.org/>
 """
 
 __version__ = "5.2.0"
@@ -74,14 +72,6 @@
 SANITIZE_HTML = 1
 
 # ---------- Python 3 modules (make it work if possible) ----------
-try:
-    # Python 3.1 introduces bytes.maketrans and simultaneously
-    # deprecates string.maketrans; use bytes.maketrans if possible
-    _maketrans = bytes.maketrans
-except (NameError, AttributeError):
-    import string
-    _maketrans = string.maketrans
-
 # base64 support for Atom feeds that contain embedded binary data
 try:
     import base64, binascii
@@ -92,15 +82,9 @@
     _base64decode = getattr(base64, 'decodebytes', base64.decodestring)
 
 # ---------- required modules (should come with any Python distribution) ----------
-import cgi
-import codecs
 import copy
-import datetime
-import itertools
 import re
 import struct
-import time
-import types
 
 try:
     from html.entities import name2codepoint, entitydefs
@@ -168,21 +152,9 @@ def _xmlescape(data,entities={}):
     else:
         _XML_AVAILABLE = 1
 
-# iconv_codec provides support for more character encodings.
-# It's available from http://cjkpython.i18n.org/
-try:
-    import iconv_codec
-except ImportError:
-    pass
-
-# chardet library auto-detects character encodings
-# Download from http://chardet.feedparser.org/
-try:
-    import chardet
-except ImportError:
-    chardet = None
-
 from .datetimes import registerDateHandler, _parse_date
+from .encodings import convert_to_utf8
+from .exceptions import *
 from .html import _BaseHTMLProcessor, _cp1252
 from .http import _build_urllib2_request, _FeedURLHandler
 from .namespaces import cc, dc, georss, itunes, mediarss, psc
@@ -191,13 +163,6 @@ def _xmlescape(data,entities={}):
 from .urls import _urljoin, _convert_to_idn, _makeSafeAbsoluteURI, _resolveRelativeURIs
 from .util import FeedParserDict
 
-# ---------- don't touch these ----------
-class ThingsNobodyCaresAboutButMe(Exception): pass
-class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
-class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
-class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
-class UndeclaredNamespace(Exception): pass
-
 SUPPORTED_VERSIONS = {'': 'unknown',
                       'rss090': 'RSS 0.90',
                       'rss091n': 'RSS 0.91 (Netscape)',
@@ -1591,235 +1556,6 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
         return _StringIO(url_file_stream_or_string.encode('utf-8'))
     return _StringIO(url_file_stream_or_string)
 
-# Each marker represents some of the characters of the opening XML
-# processing instruction ('<?xm') in the specified encoding.
-EBCDIC_MARKER = b'\x4C\x6F\xA7\x94'
-UTF16BE_MARKER = b'\x00\x3C\x00\x3F'
-UTF16LE_MARKER = b'\x3C\x00\x3F\x00'
-UTF32BE_MARKER = b'\x00\x00\x00\x3C'
-UTF32LE_MARKER = b'\x3C\x00\x00\x00'
-
-ZERO_BYTES = '\x00\x00'
-
-# Match the opening XML declaration.
-# Example: <?xml version="1.0" encoding="utf-8"?>
-RE_XML_DECLARATION = re.compile('^<\?xml[^>]*?>')
-
-# Capture the value of the XML processing instruction's encoding attribute.
-# Example: <?xml version="1.0" encoding="utf-8"?>
-RE_XML_PI_ENCODING = re.compile(b'^<\?.*encoding=[\'"](.*?)[\'"].*\?>')
-
-def convert_to_utf8(http_headers, data):
-    '''Detect and convert the character encoding to UTF-8.
-
-    http_headers is a dictionary
-    data is a raw string (not Unicode)'''
-
-    # This is so much trickier than it sounds, it's not even funny.
-    # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
-    # is application/xml, application/*+xml,
-    # application/xml-external-parsed-entity, or application/xml-dtd,
-    # the encoding given in the charset parameter of the HTTP Content-Type
-    # takes precedence over the encoding given in the XML prefix within the
-    # document, and defaults to 'utf-8' if neither are specified.  But, if
-    # the HTTP Content-Type is text/xml, text/*+xml, or
-    # text/xml-external-parsed-entity, the encoding given in the XML prefix
-    # within the document is ALWAYS IGNORED and only the encoding given in
-    # the charset parameter of the HTTP Content-Type header should be
-    # respected, and it defaults to 'us-ascii' if not specified.
-
-    # Furthermore, discussion on the atom-syntax mailing list with the
-    # author of RFC 3023 leads me to the conclusion that any document
-    # served with a Content-Type of text/* and no charset parameter
-    # must be treated as us-ascii.  (We now do this.)  And also that it
-    # must always be flagged as non-well-formed.  (We now do this too.)
-
-    # If Content-Type is unspecified (input was local file or non-HTTP source)
-    # or unrecognized (server just got it totally wrong), then go by the
-    # encoding given in the XML prefix of the document and default to
-    # 'iso-8859-1' as per the HTTP specification (RFC 2616).
-
-    # Then, assuming we didn't find a character encoding in the HTTP headers
-    # (and the HTTP Content-type allowed us to look in the body), we need
-    # to sniff the first few bytes of the XML data and try to determine
-    # whether the encoding is ASCII-compatible.  Section F of the XML
-    # specification shows the way here:
-    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
-
-    # If the sniffed encoding is not ASCII-compatible, we need to make it
-    # ASCII compatible so that we can sniff further into the XML declaration
-    # to find the encoding attribute, which will tell us the true encoding.
-
-    # Of course, none of this guarantees that we will be able to parse the
-    # feed in the declared character encoding (assuming it was declared
-    # correctly, which many are not).  iconv_codec can help a lot;
-    # you should definitely install it if you can.
-    # http://cjkpython.i18n.org/
-
-    bom_encoding = ''
-    xml_encoding = ''
-    rfc3023_encoding = ''
-
-    # Look at the first few bytes of the document to guess what
-    # its encoding may be. We only need to decode enough of the
-    # document that we can use an ASCII-compatible regular
-    # expression to search for an XML encoding declaration.
-    # The heuristic follows the XML specification, section F:
-    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
-    # Check for BOMs first.
-    if data[:4] == codecs.BOM_UTF32_BE:
-        bom_encoding = 'utf-32be'
-        data = data[4:]
-    elif data[:4] == codecs.BOM_UTF32_LE:
-        bom_encoding = 'utf-32le'
-        data = data[4:]
-    elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
-        bom_encoding = 'utf-16be'
-        data = data[2:]
-    elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
-        bom_encoding = 'utf-16le'
-        data = data[2:]
-    elif data[:3] == codecs.BOM_UTF8:
-        bom_encoding = 'utf-8'
-        data = data[3:]
-    # Check for the characters '<?xm' in several encodings.
-    elif data[:4] == EBCDIC_MARKER:
-        bom_encoding = 'cp037'
-    elif data[:4] == UTF16BE_MARKER:
-        bom_encoding = 'utf-16be'
-    elif data[:4] == UTF16LE_MARKER:
-        bom_encoding = 'utf-16le'
-    elif data[:4] == UTF32BE_MARKER:
-        bom_encoding = 'utf-32be'
-    elif data[:4] == UTF32LE_MARKER:
-        bom_encoding = 'utf-32le'
-
-    tempdata = data
-    try:
-        if bom_encoding:
-            tempdata = data.decode(bom_encoding).encode('utf-8')
-    except (UnicodeDecodeError, LookupError):
-        # feedparser recognizes UTF-32 encodings that aren't
-        # available in Python 2.4 and 2.5, so it's possible to
-        # encounter a LookupError during decoding.
-        xml_encoding_match = None
-    else:
-        xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
-
-    if xml_encoding_match:
-        xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
-        # Normalize the xml_encoding if necessary.
-        if bom_encoding and (xml_encoding in (
-            'u16', 'utf-16', 'utf16', 'utf_16',
-            'u32', 'utf-32', 'utf32', 'utf_32',
-            'iso-10646-ucs-2', 'iso-10646-ucs-4',
-            'csucs4', 'csunicode', 'ucs-2', 'ucs-4'
-        )):
-            xml_encoding = bom_encoding
-
-    # Find the HTTP Content-Type and, hopefully, a character
-    # encoding provided by the server. The Content-Type is used
-    # to choose the "correct" encoding among the BOM encoding,
-    # XML declaration encoding, and HTTP encoding, following the
-    # heuristic defined in RFC 3023.
-    http_content_type = http_headers.get('content-type') or ''
-    http_content_type, params = cgi.parse_header(http_content_type)
-    http_encoding = params.get('charset', '').replace("'", "")
-    if isinstance(http_encoding, bytes_):
-        http_encoding = http_encoding.decode('utf-8', 'ignore')
-
-    acceptable_content_type = 0
-    application_content_types = ('application/xml', 'application/xml-dtd',
-                                 'application/xml-external-parsed-entity')
-    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
-    if (http_content_type in application_content_types) or \
-       (http_content_type.startswith('application/') and
-        http_content_type.endswith('+xml')):
-        acceptable_content_type = 1
-        rfc3023_encoding = http_encoding or xml_encoding or 'utf-8'
-    elif (http_content_type in text_content_types) or \
-         (http_content_type.startswith('text/') and
-          http_content_type.endswith('+xml')):
-        acceptable_content_type = 1
-        rfc3023_encoding = http_encoding or 'us-ascii'
-    elif http_content_type.startswith('text/'):
-        rfc3023_encoding = http_encoding or 'us-ascii'
-    elif http_headers and 'content-type' not in http_headers:
-        rfc3023_encoding = xml_encoding or 'iso-8859-1'
-    else:
-        rfc3023_encoding = xml_encoding or 'utf-8'
-    # gb18030 is a superset of gb2312, so always replace gb2312
-    # with gb18030 for greater compatibility.
-    if rfc3023_encoding.lower() == 'gb2312':
-        rfc3023_encoding = 'gb18030'
-    if xml_encoding.lower() == 'gb2312':
-        xml_encoding = 'gb18030'
-
-    # there are four encodings to keep track of:
-    # - http_encoding is the encoding declared in the Content-Type HTTP header
-    # - xml_encoding is the encoding declared in the <?xml declaration
-    # - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
-    # - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
-    error = None
-
-    if http_headers and (not acceptable_content_type):
-        if 'content-type' in http_headers:
-            msg = '%s is not an XML media type' % http_headers['content-type']
-        else:
-            msg = 'no Content-type specified'
-        error = NonXMLContentType(msg)
-
-    # determine character encoding
-    known_encoding = 0
-    lazy_chardet_encoding = None
-    tried_encodings = []
-    if chardet:
-        def lazy_chardet_encoding():
-            chardet_encoding = chardet.detect(data)['encoding']
-            if not chardet_encoding:
-                chardet_encoding = ''
-            if isinstance(chardet_encoding, bytes_):
-                chardet_encoding = chardet_encoding.encode('ascii', 'ignore')
-            return chardet_encoding
-    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
-    for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding,
-                              lazy_chardet_encoding, 'utf-8', 'windows-1252', 'iso-8859-2'):
-        if isinstance(proposed_encoding, collections.Callable):
-            proposed_encoding = proposed_encoding()
-        if not proposed_encoding:
-            continue
-        if proposed_encoding in tried_encodings:
-            continue
-        tried_encodings.append(proposed_encoding)
-        try:
-            data = data.decode(proposed_encoding)
-        except (UnicodeDecodeError, LookupError):
-            pass
-        else:
-            known_encoding = 1
-            # Update the encoding in the opening XML processing instruction.
-            new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
-            if RE_XML_DECLARATION.search(data):
-                data = RE_XML_DECLARATION.sub(new_declaration, data)
-            else:
-                data = new_declaration + '\n' + data
-            data = data.encode('utf-8')
-            break
-    # if still no luck, give up
-    if not known_encoding:
-        error = CharacterEncodingUnknown(
-            'document encoding unknown, I tried ' +
-            '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
-            (rfc3023_encoding, xml_encoding))
-        rfc3023_encoding = ''
-    elif proposed_encoding != rfc3023_encoding:
-        error = CharacterEncodingOverride(
-            'document declared as %s, but parsed as %s' %
-            (rfc3023_encoding, proposed_encoding))
-        rfc3023_encoding = proposed_encoding
-
-    return data, rfc3023_encoding, error
-
 # Match XML entity declarations.
 # Example: <!ENTITY copyright "(C)">
 RE_ENTITY_PATTERN = re.compile(br'^\s*<!ENTITY([^>]*?)>', re.MULTILINE)
@@ -1878,7 +1614,6 @@ def replace_doctype(data):
     return version, data, safe_entities
 
 
-
 def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
     '''Parse a feed from a URL, file, stream, or string.