1096 lines
40 KiB
Python
1096 lines
40 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""Beautiful Soup bonus library: Unicode, Dammit
|
||
|
||
This library converts a bytestream to Unicode through any means
|
||
necessary. It is heavily based on code from Mark Pilgrim's Universal
|
||
Feed Parser. It works best on XML and HTML, but it does not rewrite the
|
||
XML or HTML to reflect a new encoding; that's the tree builder's job.
|
||
"""
|
||
# Use of this source code is governed by the MIT license.
|
||
__license__ = "MIT"
|
||
|
||
from html.entities import codepoint2name
|
||
from collections import defaultdict
|
||
import codecs
|
||
import re
|
||
import logging
|
||
import string
|
||
|
||
# Import a library to autodetect character encodings. We'll support
|
||
# any of a number of libraries that all support the same API:
|
||
#
|
||
# * cchardet
|
||
# * chardet
|
||
# * charset-normalizer
|
||
chardet_module = None

# Try each supported detector in order of preference; the first one
# that imports successfully is bound to chardet_module.
try:
    # PyPI package: cchardet
    import cchardet as chardet_module
except ImportError:
    pass

if chardet_module is None:
    try:
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet as chardet_module
    except ImportError:
        pass

if chardet_module is None:
    try:
        # PyPI package: charset-normalizer
        import charset_normalizer as chardet_module
    except ImportError:
        # No chardet available.
        pass

if chardet_module is None:
    def chardet_dammit(s):
        """No detection library is installed, so never guess an encoding."""
        return None
else:
    def chardet_dammit(s):
        """Ask the detection library for the encoding of a bytestring.

        :param s: The data to inspect.
        :return: The 'encoding' entry of the library's detect() result,
            or None when `s` is already a str.
        """
        if isinstance(s, str):
            return None
        detected = chardet_module.detect(s)
        return detected['encoding']
|
||
|
||
# Regular expressions that find an encoding declared inside an XML or
# HTML document. Each pattern is compiled twice, once for bytestrings
# and once for Unicode strings, and stored under the type it applies to.
xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
encoding_res = {
    bytes: {
        'html': re.compile(html_meta.encode("ascii"), re.I),
        'xml': re.compile(xml_encoding.encode("ascii"), re.I),
    },
    str: {
        'html': re.compile(html_meta, re.I),
        'xml': re.compile(xml_encoding, re.I),
    },
}
|
||
|
||
from html.entities import html5
|
||
|
||
class EntitySubstitution(object):
    """The ability to substitute XML or HTML entities for certain characters."""

    def _populate_class_variables():
        """Initialize variables used by this class to manage the plethora of
        HTML5 named entities.

        This function returns a 3-tuple containing two dictionaries
        and a regular expression:

        unicode_to_name - A mapping of Unicode strings like "⦨" to
        entity names like "angmsdaa". When a single Unicode string has
        multiple entity names, we try to choose the most commonly-used
        name.

        name_to_unicode: A mapping of entity names like "angmsdaa" to
        Unicode strings like "⦨".

        named_entity_re: A regular expression matching (almost) any
        Unicode string that corresponds to an HTML5 named entity.
        """
        unicode_to_name = {}
        name_to_unicode = {}

        short_entities = set()
        long_entities_by_first_character = defaultdict(set)

        for name_with_semicolon, character in sorted(html5.items()):
            # "It is intentional, for legacy compatibility, that many
            # code points have multiple character reference names. For
            # example, some appear both with and without the trailing
            # semicolon, or with different capitalizations."
            # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
            #
            # The parsers are in charge of handling (or not) character
            # references with no trailing semicolon, so we remove the
            # semicolon whenever it appears.
            if name_with_semicolon.endswith(';'):
                name = name_with_semicolon[:-1]
            else:
                name = name_with_semicolon

            # When parsing HTML, we want to recognize any known named
            # entity and convert it to a sequence of Unicode
            # characters.
            if name not in name_to_unicode:
                name_to_unicode[name] = character

            # When _generating_ HTML, we want to recognize special
            # character sequences that _could_ be converted to named
            # entities.
            unicode_to_name[character] = name

            # We also need to build a regular expression that lets us
            # _find_ those characters in output strings so we can
            # replace them.
            #
            # This is tricky, for two reasons.

            if (len(character) == 1 and ord(character) < 128
                    and character not in '<>&'):
                # First, it would be annoying to turn single ASCII
                # characters like | into named entities like
                # &verbar;. The exceptions are <>&, which we _must_
                # turn into named entities to produce valid HTML.
                continue

            if len(character) > 1 and all(ord(x) < 128 for x in character):
                # We also do not want to turn _combinations_ of ASCII
                # characters like 'fj' into named entities like '&fjlig;',
                # though that's more debatable.
                continue

            # Second, some named entities have a Unicode value that's
            # a subset of the Unicode value for some _other_ named
            # entity. As an example, '\u2267' is &GreaterFullEqual;,
            # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
            # expression needs to match the first two characters of
            # "\u2267\u0338foo", but only the first character of
            # "\u2267foo".
            #
            # In this step, we build two sets of characters that
            # _eventually_ need to go into the regular expression. But
            # we won't know exactly what the regular expression needs
            # to look like until we've gone through the entire list of
            # named entities.
            if len(character) == 1:
                short_entities.add(character)
            else:
                long_entities_by_first_character[character[0]].add(character)

        # Now that we've been through the entire list of entities, we
        # can create a regular expression that matches any of them.
        particles = set()
        for short in short_entities:
            long_versions = long_entities_by_first_character[short]
            if not long_versions:
                particles.add(short)
            else:
                ignore = "".join([x[1] for x in long_versions])
                # This finds, e.g. \u2267 but only if it is _not_
                # followed by \u0338.
                particles.add("%s(?![%s])" % (short, ignore))

        for long_entities in long_entities_by_first_character.values():
            particles.update(long_entities)

        re_definition = "(%s)" % "|".join(particles)

        # If an entity shows up in both html5 and codepoint2name, it's
        # likely that HTML5 gives it several different names, such as
        # 'rsquo' and 'rsquor'. When converting Unicode characters to
        # named entities, the codepoint2name name should take
        # precedence where possible, since that's the more easily
        # recognizable one.
        for codepoint, name in codepoint2name.items():
            unicode_to_name[chr(codepoint)] = name

        return unicode_to_name, name_to_unicode, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches an angle bracket, or an ampersand that does not begin a
    # numeric, hexadecimal or named character reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
                                           ")")

    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character string.

        :param matchobj: A match produced by CHARACTER_TO_HTML_ENTITY_RE.
        :return: The named character reference, e.g. "&eacute;".
        """
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for a special character string.

        :param matchobj: A match whose text is a key of
            CHARACTER_TO_XML_ENTITY.
        :return: The named character reference, e.g. "&amp;".
        """
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The attribute value to quote.
        :return: The value wrapped in the chosen quote character.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
            cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: A Unicode string.
        :return: The string with matching characters replaced by
            named character references.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
|
||
|
||
|
||
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the known_definite_encodings argument to the constructor).

    2. An encoding determined by sniffing the document's byte-order mark.

    3. Encodings you specifically tell EncodingDetector to try if
    byte-order mark sniffing fails (the user_encodings argument to the
    constructor).

    4. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    5. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    6. UTF-8.

    7. Windows-1252.

    """
    def __init__(self, markup, known_definite_encodings=None,
                 is_html=False, exclude_encodings=None,
                 user_encodings=None, override_encodings=None):
        """Constructor.

        :param markup: Some markup in an unknown encoding.

        :param known_definite_encodings: When determining the encoding
            of `markup`, these encodings will be tried first, in
            order. In HTML terms, this corresponds to the "known
            definite encoding" step defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding

        :param user_encodings: These encodings will be tried after the
            `known_definite_encodings` have been tried and failed, and
            after an attempt to sniff the encoding by looking at a
            byte order mark has failed. In HTML terms, this
            corresponds to the step "user has explicitly instructed
            the user agent to override the document's character
            encoding", defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding

        :param override_encodings: A deprecated alias for
            known_definite_encodings. Any encodings here will be tried
            immediately after the encodings in
            known_definite_encodings.

        :param is_html: If True, this markup is considered to be
            HTML. Otherwise it's assumed to be XML.

        :param exclude_encodings: These encodings will not be tried,
            even if they otherwise would be.

        """
        # Copy the caller's list so appending the deprecated
        # override_encodings never mutates their object.
        self.known_definite_encodings = list(known_definite_encodings or [])
        if override_encodings:
            self.known_definite_encodings += override_encodings
        self.user_encodings = user_encodings or []
        exclude_encodings = exclude_encodings or []
        self.exclude_encodings = {x.lower() for x in exclude_encodings}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding.
        :param tried: Encodings that have already been tried. This will be modified
            as a side effect.
        :return: True if the encoding is neither excluded nor already
            tried; False otherwise (including when `encoding` is None).
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup.

        :yield: A sequence of strings.
        """
        tried = set()

        # First, try the known definite encodings
        for e in self.known_definite_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Sniffing the byte-order mark did nothing; try the user
        # encodings.
        for e in self.user_encodings:
            if self._usable(e, tried):
                yield e

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: Some markup.
        :return: A 2-tuple (modified data, implied encoding)
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The two bytes after a UTF-16 BOM must be compared against a
        # bytes literal: FF FE 00 00 is the UTF-32LE BOM, and comparing
        # bytes to a str is always unequal, which would misreport
        # UTF-32LE data as UTF-16LE.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed to be declared near the beginning
            of the document, most of the time it's only necessary to search a few kilobytes of data.
            Set this to True to force this method to search the entire document.
        :return: The declared encoding, lowercased, or None if no
            declaration was found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        # Pick the patterns compiled for the type of `markup`.
        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        xml_re = res['xml']
        html_re = res['html']
        declared_encoding = None
        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode('ascii', 'replace')
            return declared_encoding.lower()
        return None
|
||
|
||
class UnicodeDammit:
|
||
"""A class for detecting the encoding of a *ML document and
|
||
converting it to a Unicode string. If the source encoding is
|
||
windows-1252, can replace MS smart quotes with their HTML or XML
|
||
equivalents."""
|
||
|
||
# Commonly seen "charset" values from HTML meta tags, mapped to the
# corresponding Python codec names. Only values that are missing from
# Python's own aliases and that the heuristics in find_codec cannot
# work out are listed here.
CHARSET_ALIASES = {
    "macintosh": "mac-roman",
    "x-sjis": "shift-jis",
}

# _convert_from only substitutes smart quotes when the proposed
# encoding is one of these.
ENCODINGS_WITH_SMART_QUOTES = [
    "windows-1252",
    "iso-8859-1",
    "iso-8859-2",
]
|
||
|
||
def __init__(self, markup, known_definite_encodings=None,
             smart_quotes_to=None, is_html=False, exclude_encodings=None,
             user_encodings=None, override_encodings=None
             ):
    """Constructor.

    :param markup: A bytestring representing markup in an unknown encoding.

    :param known_definite_encodings: When determining the encoding
        of `markup`, these encodings will be tried first, in
        order. In HTML terms, this corresponds to the "known
        definite encoding" step defined here:
        https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding

    :param user_encodings: These encodings will be tried after the
        `known_definite_encodings` have been tried and failed, and
        after an attempt to sniff the encoding by looking at a
        byte order mark has failed. In HTML terms, this
        corresponds to the step "user has explicitly instructed
        the user agent to override the document's character
        encoding", defined here:
        https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding

    :param override_encodings: A deprecated alias for
        known_definite_encodings. Any encodings here will be tried
        immediately after the encodings in
        known_definite_encodings.

    :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
       to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
       Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
       will convert them to HTML entity references.
    :param is_html: If True, this markup is considered to be HTML. Otherwise
        it's assumed to be XML.
    :param exclude_encodings: These encodings will not be considered, even
        if the sniffing code thinks they might make sense.

    """
    self.smart_quotes_to = smart_quotes_to
    self.tried_encodings = []
    self.contains_replacement_characters = False
    self.is_html = is_html
    self.log = logging.getLogger(__name__)
    # EncodingDetector treats None the same as an empty list for
    # every encoding-list argument, so None defaults are passed
    # straight through instead of sharing mutable default lists.
    self.detector = EncodingDetector(
        markup, known_definite_encodings, is_html, exclude_encodings,
        user_encodings, override_encodings
    )

    # Short-circuit if the data is in Unicode to begin with.
    if isinstance(markup, str):
        self.markup = markup
        self.unicode_markup = str(markup)
        self.original_encoding = None
        return

    # The encoding detector may have stripped a byte-order mark.
    # Use the stripped markup from this point on.
    self.markup = self.detector.markup

    u = None
    for encoding in self.detector.encodings:
        u = self._convert_from(encoding)
        if u is not None:
            break

    # NOTE(review): an empty decoded string is falsy, so empty input
    # is handled the same way as a failed decode here and below.
    # Kept as-is for backward compatibility; confirm whether that is
    # intended.
    if not u:
        # None of the encodings worked. As an absolute last resort,
        # try them again with character replacement.

        for encoding in self.detector.encodings:
            if encoding != "ascii":
                u = self._convert_from(encoding, "replace")
            if u is not None:
                self.log.warning(
                        "Some characters could not be decoded, and were "
                        "replaced with REPLACEMENT CHARACTER."
                )
                self.contains_replacement_characters = True
                break

    # If none of that worked, we could at this point force it to
    # ASCII, but that would destroy so much data that I think
    # giving up is better.
    self.unicode_markup = u
    if not u:
        self.original_encoding = None
|
||
|
||
def _sub_ms_char(self, match):
|
||
"""Changes a MS smart quote character to an XML or HTML
|
||
entity, or an ASCII character."""
|
||
orig = match.group(1)
|
||
if self.smart_quotes_to == 'ascii':
|
||
sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
|
||
else:
|
||
sub = self.MS_CHARS.get(orig)
|
||
if type(sub) == tuple:
|
||
if self.smart_quotes_to == 'xml':
|
||
sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
|
||
else:
|
||
sub = '&'.encode() + sub[0].encode() + ';'.encode()
|
||
else:
|
||
sub = sub.encode()
|
||
return sub
|
||
|
||
def _convert_from(self, proposed, errors="strict"):
|
||
"""Attempt to convert the markup to the proposed encoding.
|
||
|
||
:param proposed: The name of a character encoding.
|
||
"""
|
||
proposed = self.find_codec(proposed)
|
||
if not proposed or (proposed, errors) in self.tried_encodings:
|
||
return None
|
||
self.tried_encodings.append((proposed, errors))
|
||
markup = self.markup
|
||
# Convert smart quotes to HTML if coming from an encoding
|
||
# that might have them.
|
||
if (self.smart_quotes_to is not None
|
||
and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
|
||
smart_quotes_re = b"([\x80-\x9f])"
|
||
smart_quotes_compiled = re.compile(smart_quotes_re)
|
||
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
|
||
|
||
try:
|
||
#print("Trying to convert document to %s (errors=%s)" % (
|
||
# proposed, errors))
|
||
u = self._to_unicode(markup, proposed, errors)
|
||
self.markup = u
|
||
self.original_encoding = proposed
|
||
except Exception as e:
|
||
#print("That didn't work!")
|
||
#print(e)
|
||
return None
|
||
#print("Correct encoding: %s" % proposed)
|
||
return self.markup
|
||
|
||
def _to_unicode(self, data, encoding, errors="strict"):
|
||
"""Given a string and its encoding, decodes the string into Unicode.
|
||
|
||
:param encoding: The name of an encoding.
|
||
"""
|
||
return str(data, encoding, errors)
|
||
|
||
@property
|
||
def declared_html_encoding(self):
|
||
"""If the markup is an HTML document, returns the encoding declared _within_
|
||
the document.
|
||
"""
|
||
if not self.is_html:
|
||
return None
|
||
return self.detector.declared_encoding
|
||
|
||
def find_codec(self, charset):
|
||
"""Convert the name of a character set to a codec name.
|
||
|
||
:param charset: The name of a character set.
|
||
:return: The name of a codec.
|
||
"""
|
||
value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
|
||
or (charset and self._codec(charset.replace("-", "")))
|
||
or (charset and self._codec(charset.replace("-", "_")))
|
||
or (charset and charset.lower())
|
||
or charset
|
||
)
|
||
if value:
|
||
return value.lower()
|
||
return None
|
||
|
||
def _codec(self, charset):
|
||
if not charset:
|
||
return charset
|
||
codec = None
|
||
try:
|
||
codecs.lookup(charset)
|
||
codec = charset
|
||
except (LookupError, ValueError):
|
||
pass
|
||
return codec
|
||
|
||
|
||
# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
#
# Tuple values are (HTML entity name, hexadecimal code point); plain
# string values are substituted verbatim.
MS_CHARS = {b'\x80': ('euro', '20AC'),
            b'\x81': ' ',
            b'\x82': ('sbquo', '201A'),
            b'\x83': ('fnof', '192'),
            b'\x84': ('bdquo', '201E'),
            b'\x85': ('hellip', '2026'),
            b'\x86': ('dagger', '2020'),
            b'\x87': ('Dagger', '2021'),
            b'\x88': ('circ', '2C6'),
            b'\x89': ('permil', '2030'),
            b'\x8A': ('Scaron', '160'),
            b'\x8B': ('lsaquo', '2039'),
            b'\x8C': ('OElig', '152'),
            b'\x8D': '?',
            b'\x8E': ('#x17D', '17D'),
            b'\x8F': '?',
            b'\x90': '?',
            b'\x91': ('lsquo', '2018'),
            b'\x92': ('rsquo', '2019'),
            b'\x93': ('ldquo', '201C'),
            b'\x94': ('rdquo', '201D'),
            b'\x95': ('bull', '2022'),
            b'\x96': ('ndash', '2013'),
            b'\x97': ('mdash', '2014'),
            b'\x98': ('tilde', '2DC'),
            b'\x99': ('trade', '2122'),
            b'\x9a': ('scaron', '161'),
            b'\x9b': ('rsaquo', '203A'),
            b'\x9c': ('oelig', '153'),
            b'\x9d': '?',
            b'\x9e': ('#x17E', '17E'),
            # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS; an empty
            # code point here would produce the invalid reference "&#x;".
            b'\x9f': ('Yuml', '178'),}
|
||
|
||
# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
# horrors like stripping diacritical marks to turn á into a, but also
# contains non-horrors like turning “ into ".
#
# Every value is a plain string, so callers can .encode() any entry.
MS_CHARS_TO_ASCII = {
    b'\x80' : 'EUR',
    b'\x81' : ' ',
    b'\x82' : ',',
    b'\x83' : 'f',
    b'\x84' : ',,',
    b'\x85' : '...',
    b'\x86' : '+',
    b'\x87' : '++',
    b'\x88' : '^',
    b'\x89' : '%',
    b'\x8a' : 'S',
    b'\x8b' : '<',
    b'\x8c' : 'OE',
    b'\x8d' : '?',
    b'\x8e' : 'Z',
    b'\x8f' : '?',
    b'\x90' : '?',
    b'\x91' : "'",
    b'\x92' : "'",
    b'\x93' : '"',
    b'\x94' : '"',
    b'\x95' : '*',
    b'\x96' : '-',
    b'\x97' : '--',
    b'\x98' : '~',
    b'\x99' : '(TM)',
    b'\x9a' : 's',
    b'\x9b' : '>',
    b'\x9c' : 'oe',
    b'\x9d' : '?',
    b'\x9e' : 'z',
    b'\x9f' : 'Y',
    b'\xa0' : ' ',
    b'\xa1' : '!',
    b'\xa2' : 'c',
    b'\xa3' : 'GBP',
    b'\xa4' : '$', #This approximation is especially parochial--this is the
                   #generic currency symbol.
    b'\xa5' : 'YEN',
    b'\xa6' : '|',
    b'\xa7' : 'S',
    b'\xa8' : '..',
    b'\xa9' : '',
    b'\xaa' : '(th)',
    b'\xab' : '<<',
    b'\xac' : '!',
    b'\xad' : ' ',
    b'\xae' : '(R)',
    b'\xaf' : '-',
    b'\xb0' : 'o',
    b'\xb1' : '+-',
    b'\xb2' : '2',
    b'\xb3' : '3',
    b'\xb4' : "'",  # ACUTE ACCENT; was a stray (char, entity-name) tuple.
    b'\xb5' : 'u',
    b'\xb6' : 'P',
    b'\xb7' : '*',
    b'\xb8' : ',',
    b'\xb9' : '1',
    b'\xba' : '(th)',
    b'\xbb' : '>>',
    b'\xbc' : '1/4',
    b'\xbd' : '1/2',
    b'\xbe' : '3/4',
    b'\xbf' : '?',
    b'\xc0' : 'A',
    b'\xc1' : 'A',
    b'\xc2' : 'A',
    b'\xc3' : 'A',
    b'\xc4' : 'A',
    b'\xc5' : 'A',
    b'\xc6' : 'AE',
    b'\xc7' : 'C',
    b'\xc8' : 'E',
    b'\xc9' : 'E',
    b'\xca' : 'E',
    b'\xcb' : 'E',
    b'\xcc' : 'I',
    b'\xcd' : 'I',
    b'\xce' : 'I',
    b'\xcf' : 'I',
    b'\xd0' : 'D',
    b'\xd1' : 'N',
    b'\xd2' : 'O',
    b'\xd3' : 'O',
    b'\xd4' : 'O',
    b'\xd5' : 'O',
    b'\xd6' : 'O',
    b'\xd7' : '*',
    b'\xd8' : 'O',
    b'\xd9' : 'U',
    b'\xda' : 'U',
    b'\xdb' : 'U',
    b'\xdc' : 'U',
    b'\xdd' : 'Y',
    b'\xde' : 'b',
    b'\xdf' : 'B',
    b'\xe0' : 'a',
    b'\xe1' : 'a',
    b'\xe2' : 'a',
    b'\xe3' : 'a',
    b'\xe4' : 'a',
    b'\xe5' : 'a',
    b'\xe6' : 'ae',
    b'\xe7' : 'c',
    b'\xe8' : 'e',
    b'\xe9' : 'e',
    b'\xea' : 'e',
    b'\xeb' : 'e',
    b'\xec' : 'i',
    b'\xed' : 'i',
    b'\xee' : 'i',
    b'\xef' : 'i',
    b'\xf0' : 'o',
    b'\xf1' : 'n',
    b'\xf2' : 'o',
    b'\xf3' : 'o',
    b'\xf4' : 'o',
    b'\xf5' : 'o',
    b'\xf6' : 'o',
    b'\xf7' : '/',
    b'\xf8' : 'o',
    b'\xf9' : 'u',
    b'\xfa' : 'u',
    b'\xfb' : 'u',
    b'\xfc' : 'u',
    b'\xfd' : 'y',
    b'\xfe' : 'b',
    b'\xff' : 'y',
    }
|
||
|
||
# A map used when removing rogue Windows-1252/ISO-8859-1
# characters in otherwise UTF-8 documents.
#
# Keys are Windows-1252 byte values; values are the UTF-8 encoding
# of the same character.
#
# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
# Windows-1252.
WINDOWS_1252_TO_UTF8 = {
    0x80 : b'\xe2\x82\xac', # €
    0x82 : b'\xe2\x80\x9a', # ‚
    0x83 : b'\xc6\x92',     # ƒ
    0x84 : b'\xe2\x80\x9e', # „
    0x85 : b'\xe2\x80\xa6', # …
    0x86 : b'\xe2\x80\xa0', # †
    0x87 : b'\xe2\x80\xa1', # ‡
    0x88 : b'\xcb\x86',     # ˆ
    0x89 : b'\xe2\x80\xb0', # ‰
    0x8a : b'\xc5\xa0',     # Š
    0x8b : b'\xe2\x80\xb9', # ‹
    0x8c : b'\xc5\x92',     # Œ
    0x8e : b'\xc5\xbd',     # Ž
    0x91 : b'\xe2\x80\x98', # ‘
    0x92 : b'\xe2\x80\x99', # ’
    0x93 : b'\xe2\x80\x9c', # “
    0x94 : b'\xe2\x80\x9d', # ”
    0x95 : b'\xe2\x80\xa2', # •
    0x96 : b'\xe2\x80\x93', # –
    0x97 : b'\xe2\x80\x94', # —
    0x98 : b'\xcb\x9c',     # ˜
    0x99 : b'\xe2\x84\xa2', # ™
    0x9a : b'\xc5\xa1',     # š
    0x9b : b'\xe2\x80\xba', # ›
    0x9c : b'\xc5\x93',     # œ
    0x9e : b'\xc5\xbe',     # ž
    0x9f : b'\xc5\xb8',     # Ÿ
    0xa0 : b'\xc2\xa0',     # (no-break space)
    0xa1 : b'\xc2\xa1',     # ¡
    0xa2 : b'\xc2\xa2',     # ¢
    0xa3 : b'\xc2\xa3',     # £
    0xa4 : b'\xc2\xa4',     # ¤
    0xa5 : b'\xc2\xa5',     # ¥
    0xa6 : b'\xc2\xa6',     # ¦
    0xa7 : b'\xc2\xa7',     # §
    0xa8 : b'\xc2\xa8',     # ¨
    0xa9 : b'\xc2\xa9',     # ©
    0xaa : b'\xc2\xaa',     # ª
    0xab : b'\xc2\xab',     # «
    0xac : b'\xc2\xac',     # ¬
    0xad : b'\xc2\xad',     # (soft hyphen)
    0xae : b'\xc2\xae',     # ®
    0xaf : b'\xc2\xaf',     # ¯
    0xb0 : b'\xc2\xb0',     # °
    0xb1 : b'\xc2\xb1',     # ±
    0xb2 : b'\xc2\xb2',     # ²
    0xb3 : b'\xc2\xb3',     # ³
    0xb4 : b'\xc2\xb4',     # ´
    0xb5 : b'\xc2\xb5',     # µ
    0xb6 : b'\xc2\xb6',     # ¶
    0xb7 : b'\xc2\xb7',     # ·
    0xb8 : b'\xc2\xb8',     # ¸
    0xb9 : b'\xc2\xb9',     # ¹
    0xba : b'\xc2\xba',     # º
    0xbb : b'\xc2\xbb',     # »
    0xbc : b'\xc2\xbc',     # ¼
    0xbd : b'\xc2\xbd',     # ½
    0xbe : b'\xc2\xbe',     # ¾
    0xbf : b'\xc2\xbf',     # ¿
    0xc0 : b'\xc3\x80',     # À
    0xc1 : b'\xc3\x81',     # Á
    0xc2 : b'\xc3\x82',     # Â
    0xc3 : b'\xc3\x83',     # Ã
    0xc4 : b'\xc3\x84',     # Ä
    0xc5 : b'\xc3\x85',     # Å
    0xc6 : b'\xc3\x86',     # Æ
    0xc7 : b'\xc3\x87',     # Ç
    0xc8 : b'\xc3\x88',     # È
    0xc9 : b'\xc3\x89',     # É
    0xca : b'\xc3\x8a',     # Ê
    0xcb : b'\xc3\x8b',     # Ë
    0xcc : b'\xc3\x8c',     # Ì
    0xcd : b'\xc3\x8d',     # Í
    0xce : b'\xc3\x8e',     # Î
    0xcf : b'\xc3\x8f',     # Ï
    0xd0 : b'\xc3\x90',     # Ð
    0xd1 : b'\xc3\x91',     # Ñ
    0xd2 : b'\xc3\x92',     # Ò
    0xd3 : b'\xc3\x93',     # Ó
    0xd4 : b'\xc3\x94',     # Ô
    0xd5 : b'\xc3\x95',     # Õ
    0xd6 : b'\xc3\x96',     # Ö
    0xd7 : b'\xc3\x97',     # ×
    0xd8 : b'\xc3\x98',     # Ø
    0xd9 : b'\xc3\x99',     # Ù
    0xda : b'\xc3\x9a',     # Ú
    0xdb : b'\xc3\x9b',     # Û
    0xdc : b'\xc3\x9c',     # Ü
    0xdd : b'\xc3\x9d',     # Ý
    0xde : b'\xc3\x9e',     # Þ
    0xdf : b'\xc3\x9f',     # ß
    0xe0 : b'\xc3\xa0',     # à
    0xe1 : b'\xc3\xa1',     # á
    0xe2 : b'\xc3\xa2',     # â
    0xe3 : b'\xc3\xa3',     # ã
    0xe4 : b'\xc3\xa4',     # ä
    0xe5 : b'\xc3\xa5',     # å
    0xe6 : b'\xc3\xa6',     # æ
    0xe7 : b'\xc3\xa7',     # ç
    0xe8 : b'\xc3\xa8',     # è
    0xe9 : b'\xc3\xa9',     # é
    0xea : b'\xc3\xaa',     # ê
    0xeb : b'\xc3\xab',     # ë
    0xec : b'\xc3\xac',     # ì
    0xed : b'\xc3\xad',     # í
    0xee : b'\xc3\xae',     # î
    0xef : b'\xc3\xaf',     # ï
    0xf0 : b'\xc3\xb0',     # ð
    0xf1 : b'\xc3\xb1',     # ñ
    0xf2 : b'\xc3\xb2',     # ò
    0xf3 : b'\xc3\xb3',     # ó
    0xf4 : b'\xc3\xb4',     # ô
    0xf5 : b'\xc3\xb5',     # õ
    0xf6 : b'\xc3\xb6',     # ö
    0xf7 : b'\xc3\xb7',     # ÷
    0xf8 : b'\xc3\xb8',     # ø
    0xf9 : b'\xc3\xb9',     # ù
    0xfa : b'\xc3\xba',     # ú
    0xfb : b'\xc3\xbb',     # û
    0xfc : b'\xc3\xbc',     # ü
    0xfd : b'\xc3\xbd',     # ý
    0xfe : b'\xc3\xbe',     # þ
    }
|
||
|
||
# Each entry is (lowest lead byte, highest lead byte, total length in
# bytes) for one class of UTF-8 multibyte character.
MULTIBYTE_MARKERS_AND_SIZES = [
    (0xc2, 0xdf, 2),  # 2-byte characters start with a byte C2-DF
    (0xe0, 0xef, 3),  # 3-byte characters start with E0-EF
    (0xf0, 0xf4, 4),  # 4-byte characters start with F0-F4
]

# The lowest and highest lead bytes across all of the ranges above.
FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
|
||
|
||
@classmethod
def detwingle(cls, in_bytes, main_encoding="utf8",
              embedded_encoding="windows-1252"):
    """Fix characters from one encoding embedded in some other encoding.

    Currently the only situation supported is Windows-1252 (or its
    subset ISO-8859-1), embedded in UTF-8.

    A byte in the UTF-8 lead-byte range is treated as the start of a
    UTF-8 character only when every byte that follows it, within the
    length announced by the lead byte, is a UTF-8 continuation byte
    (0x80-0xBF). Otherwise the byte is looked up as a Windows-1252
    character. A sequence cut short by the end of `in_bytes` is left
    untouched, since it cannot be told apart from a character that
    continues in a later chunk.

    :param in_bytes: A bytestring that you suspect contains
        characters from multiple encodings. Note that this _must_
        be a bytestring. If you've already converted the document
        to Unicode, you're too late.
    :param main_encoding: The primary encoding of `in_bytes`.
        Only UTF-8 is accepted ("utf8", "utf-8" or "utf_8", in any
        letter case).
    :param embedded_encoding: The encoding that was used to embed characters
        in the main document. Only the name Windows-1252 is accepted
        ("windows-1252" or "windows_1252", in any letter case).
    :return: A bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents. If nothing needed converting, `in_bytes` itself
        is returned.
    :raises NotImplementedError: If either encoding name is not one of
        the accepted spellings listed above.
    """
    # Normalize '_' to '-' before comparing, so one spelling covers both.
    if embedded_encoding.replace('_', '-').lower() != 'windows-1252':
        raise NotImplementedError(
            "Windows-1252 and ISO-8859-1 are the only currently supported "
            "embedded encodings.")

    if main_encoding.replace('_', '-').lower() not in ('utf8', 'utf-8'):
        raise NotImplementedError(
            "UTF-8 is the only currently supported main encoding.")

    def as_int(item):
        # Indexing a bytestring yields ints. One-character items are
        # still tolerated, as they were before, by taking their ordinal.
        return item if isinstance(item, int) else ord(item)

    length = len(in_bytes)
    byte_chunks = []
    chunk_start = 0
    pos = 0
    while pos < length:
        byte = as_int(in_bytes[pos])

        # How long a UTF-8 character would be if this byte started one
        # (0 when the byte is not in any lead-byte range).
        size = 0
        if cls.FIRST_MULTIBYTE_MARKER <= byte <= cls.LAST_MULTIBYTE_MARKER:
            for start, end, marker_size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                if start <= byte <= end:
                    size = marker_size
                    break

        if size and all(
                0x80 <= as_int(item) <= 0xbf
                for item in in_bytes[pos + 1:pos + size]):
            # A UTF-8 multibyte character (possibly truncated by the
            # end of the input, in which case the slice is short and
            # we simply run off the end). Skip it unchanged.
            pos += size
        elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
            # We found a Windows-1252 character, either a byte that can
            # never start a UTF-8 character or a would-be lead byte
            # that is not followed by continuation bytes.
            #
            # NOTE(review): the second case makes the table entries
            # for 0xC2-0xF4 reachable. The entry for 0xE1 reads
            # b'\xa1', which is not valid UTF-8 by itself (the
            # encoding of U+00E1 is b'\xc3\xa1'); the table needs
            # correcting.
            #
            # Save the string up to this point as a chunk, then add
            # the UTF-8 translation of the character as another chunk.
            byte_chunks.append(in_bytes[chunk_start:pos])
            byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
            pos += 1
            chunk_start = pos
        else:
            # ASCII, or a high byte we have no translation for.
            # Go on to the next character.
            pos += 1

    if not byte_chunks:
        # The string is unchanged.
        return in_bytes

    # Store the final chunk.
    byte_chunks.append(in_bytes[chunk_start:])
    return b''.join(byte_chunks)
|
||
|