# cython: language_level=3

from __future__ import absolute_import

import difflib
from lxml import etree
from lxml.html import fragment_fromstring
import re

__all__ = ['html_annotate', 'htmldiff']

try:
    from html import escape as html_escape
except ImportError:
    from cgi import escape as html_escape
try:
    _unicode = unicode
except NameError:
    # Python 3
    _unicode = str
try:
    basestring
except NameError:
    # Python 3
    basestring = str

############################################################
## Annotation
############################################################

def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(_unicode(version), 1), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy we have is to split the documents up into
    # logical tokens (which are words with attached markup).  We then
    # do diffs of each of the versions to track when a token first
    # appeared in the document; the annotation attached to the token
    # is the version where it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After we've tracked all the tokens, we can combine spans of text
    # that are adjacent and have the same annotation
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()
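
# Illustrative usage (a sketch, not a doctest -- exact whitespace can vary
# with the input markup): each word ends up wrapped with the version in which
# it first appeared, and a custom markup callable may replace the default span:
#
#   html_annotate([('Hello World', 'v1'), ('Hello Beautiful World', 'v2')])
#   # -> roughly '<span title="v1">Hello</span> <span title="v2">Beautiful</span>
#   #    <span title="v1">World</span>'
#
#   html_annotate([('Hello', 'v1')],
#                 markup=lambda text, version: '<em title="%s">%s</em>' % (version, text))
#   # -> '<em title="v1">Hello</em>'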

def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not result[-1].post_tags and
                not tok.pre_tags and
                result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result

def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = _unicode(last)
        if last.trailing_whitespace:
            text += last.trailing_whitespace
        text += tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        html = token.html()
        html = markup_func(html, token.annotation)
        if token.trailing_whitespace:
            html += token.trailing_whitespace
        yield html
        for post in token.post_tags:
            yield post


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    result = ''.join(result).strip()
    return fixup_ins_del_tags(result)
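
# Illustrative usage (a sketch; the exact whitespace and the position of the
# <del> relative to the surrounding tags can vary with the input):
#
#   htmldiff('<p>Hello World</p>', '<p>Goodbye World</p>')
#   # -> roughly '<p> <ins>Goodbye</ins> <del>Hello</del> World</p>'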

def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only do a best effort attempt
    # to keep markup from the old document; anything that we can't
    # resolve we throw away.  Also we try to put the deletes as close
    # to the location where we think they would have been -- because
    # we are only keeping the markup from the new document, it can be
    # fuzzy where in the new document the old text would have gone.
    # Again we just do a best effort attempt.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)
    # If deletes were inserted directly as <del> then we'd have an
    # invalid document at this point.  Instead we put in special
    # markers, and when the complete diffed document has been created
    # we try to move the deletes around and resolve any problems.
    result = cleanup_delete(result)

    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        for pre in token.pre_tags:
            yield pre
        if not equal or not token.hide_when_equal:
            if token.trailing_whitespace:
                yield token.html() + token.trailing_whitespace
            else:
                yield token.html()
        for post in token.post_tags:
            yield post

def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Though we don't throw away unbalanced_start or unbalanced_end
    # (we assume there is accompanying markup later or earlier in the
    # document), we only put <ins> around the balanced portion.
    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
    doc.extend(unbalanced_start)
    if doc and not doc[-1].endswith(' '):
        # Fix up the case where the word before the insert didn't end with
        # a space
        doc[-1] += ' '
    doc.append('<ins>')
    if balanced and balanced[-1].endswith(' '):
        # We move space outside of </ins>
        balanced[-1] = balanced[-1][:-1]
    doc.extend(balanced)
    doc.append('</ins> ')
    doc.extend(unbalanced_end)
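
# A sketch of the behaviour: balanced markup is wrapped in <ins>, while any
# unbalanced tags in ins_chunks are emitted outside of it so the surrounding
# document stays well formed.
#
#   doc = ['Hello ']
#   merge_insert(['<b>', 'brave ', '</b>', 'new '], doc)
#   # doc is now roughly
#   # ['Hello ', '<ins>', '<b>', 'brave ', '</b>', 'new', '</ins> ']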

# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass
class DEL_END:
    pass

class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
    (DEL_START/DEL_END) """

def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with markers to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location to where it was originally located (e.g., moving a
    delete into a preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END))."""
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            pre_delete, delete, post_delete = split_delete(chunks)
        except NoDeletes:
            # Nothing found, we've cleaned up the entire doc
            break
        # The stuff-inside-DEL_START/END may not be well balanced
        # markup.  First we figure out what unbalanced portions there are:
        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
        # Then we move the span forward and/or backward based on these
        # unbalanced portions:
        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
        doc = pre_delete
        if doc and not doc[-1].endswith(' '):
            # Fix up case where the word before us didn't have a trailing space
            doc[-1] += ' '
        doc.append('<del>')
        if balanced and balanced[-1].endswith(' '):
            # We move space outside of </del>
            balanced[-1] = balanced[-1][:-1]
        doc.extend(balanced)
        doc.append('</del> ')
        doc.extend(post_delete)
        chunks = doc
    return chunks
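
# A sketch of the marker resolution: given chunks like
#
#   ['<p>', 'Hi ', DEL_START, 'there ', DEL_END, 'world', '</p>']
#
# cleanup_delete() replaces the DEL_START/DEL_END pair with real tags,
# yielding roughly
#
#   ['<p>', 'Hi ', '<del>', 'there', '</del> ', 'world', '</p>']
#
# If the deleted span contains unbalanced tags, those are matched against the
# surrounding document (or dropped) before the <del> is inserted.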

def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
    a list of text and tag chunks.

    unbalanced_start is a list of all the tags that are opened, but
    not closed in this span.  Similarly, unbalanced_end is a list of
    tags that are closed but were not opened.  Extracting these might
    mean some reordering of the chunks."""
    start = []
    end = []
    tag_stack = []
    balanced = []
    for chunk in chunks:
        if not chunk.startswith('<'):
            balanced.append(chunk)
            continue
        endtag = chunk[1] == '/'
        name = chunk.split()[0].strip('<>/')
        if name in empty_tags:
            balanced.append(chunk)
            continue
        if endtag:
            if tag_stack and tag_stack[-1][0] == name:
                balanced.append(chunk)
                name, pos, tag = tag_stack.pop()
                balanced[pos] = tag
            elif tag_stack:
                start.extend([tag for name, pos, tag in tag_stack])
                tag_stack = []
                end.append(chunk)
            else:
                end.append(chunk)
        else:
            tag_stack.append((name, len(balanced), chunk))
            balanced.append(None)
    start.extend(
        [chunk for name, pos, chunk in tag_stack])
    balanced = [chunk for chunk in balanced if chunk is not None]
    return start, balanced, end
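
# A sketch of the three-way split:
#
#   split_unbalanced(['</i>', 'one ', '<b>', 'two ', '</b>', '<b>', 'three'])
#   # -> (['<b>'],                                   # opened here, never closed
#   #     ['one ', '<b>', 'two ', '</b>', 'three'],  # well-balanced middle
#   #     ['</i>'])                                  # closed here, never opened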

def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
    stuff_after_DEL_END).  Returns the first case found (there may be
    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
    there's no DEL_START found. """
    try:
        pos = chunks.index(DEL_START)
    except ValueError:
        raise NoDeletes
    pos2 = chunks.index(DEL_END)
    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]

def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
    document (where the two were split).  This moves that point (by
    popping items from one and pushing them onto the other).  It moves
    the point to try to find a place where unbalanced_start applies.

    As an example::

        >>> unbalanced_start = ['<div>']
        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
        >>> pre, post = doc[:3], doc[3:]
        >>> pre, post
        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])

    As you can see, we moved the point so that the dangling <div> that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, can we go further?  Maybe not...
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break

def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith('</'):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break

class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, though we do not want that whitespace to make the word
    compare differently from a similar word that does not have a
    trailing space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = _unicode.__new__(cls, text)

        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []

        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []

        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
                                          self.post_tags, self.trailing_whitespace)

    def html(self):
        return _unicode(self)
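
# A sketch of what a token looks like: the word "Hello" at the start of a
# paragraph, followed by a space, would be represented roughly as
#
#   token('Hello', pre_tags=['<p>'], post_tags=[], trailing_whitespace=' ')
#
# Because token subclasses the text type, it compares equal to any other token
# (or string) with the same text, which is what lets difflib match words
# independently of the markup around them.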

class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, pre_tags=%r, post_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self

def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and return token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Then we split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-joining them into token objects:
    return fixup_chunks(chunks)
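
# Illustrative result (a sketch; attribute order and whitespace may vary):
#
#   tokenize('<p>Hello <b>World</b></p>')
#   # -> roughly [token('Hello', pre_tags=['<p>'], trailing_whitespace=' '),
#   #             token('World', pre_tags=['<b>'], post_tags=['</b>', '</p>'])]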

def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)

_body_re = re.compile(r'<body.*?>', re.I|re.S)
_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html
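
# For instance (a sketch): given a full page, only the body content survives
# and any pre-existing <ins>/<del> markup is stripped:
#
#   cleanup_html('<html><body><p>Hi <ins>x</ins></p></body></html>')
#   # -> '<p>Hi x</p>'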

end_whitespace_re = re.compile(r'[ \t\n\r]$')

def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]


def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)

            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)

        elif is_start_tag(chunk):
            tag_accum.append(chunk)

        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result


# All the tags in HTML that don't require end tags:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')

block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )

block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )


def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)
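
# A sketch of the chunk stream for a small fragment (passing in the wrapping
# <div> that parse_html() creates, with skip_tag=True as tokenize() does):
#
#   list(flatten_el(parse_html('<p>Hi <a href="/x">there</a></p>'),
#                   include_hrefs=True, skip_tag=True))
#   # -> roughly ['<p>', 'Hi ', '<a href="/x">', 'there', ('href', '/x'),
#   #             '</a>', '</p>']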

split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)

def split_words(text):
    """ Splits some text into words.  Includes trailing whitespace
    on each word when appropriate.  """
    if not text or not text.strip():
        return []

    words = split_words_re.findall(text)
    return words

start_whitespace_re = re.compile(r'^[ \t\n\r]')

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '</%s>%s' % (el.tag, extra)

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html
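
# Illustrative transform (a sketch; serialization details may vary slightly):
#
#   fixup_ins_del_tags('<ins><p>one</p><p>two</p></ins>')
#   # -> roughly '<p><ins>one</ins></p><p><ins>two</ins></p>'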

def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()
            #_merge_element_contents(el)

def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>, etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False

def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
    and moves them inside any block-level tags.  """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level tags in any child
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_tag)
        else:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()
class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    def get_matching_blocks(self):
        size = min(len(self.a), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]
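
# A sketch of the effect: for inputs of eight or more tokens the threshold is
# 2, so an isolated two-token match inside otherwise rewritten text is dropped
# instead of splitting the change into fragments.
#
#   a = 'one two three four x y five six seven eight'.split()
#   b = 'uno dos tres cuatro x y cinco seis siete ocho'.split()
#   InsensitiveSequenceMatcher(a=a, b=b).get_matching_blocks()
#   # -> only the terminating zero-length block; the short "x y" match is gone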

if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()