diff --git a/htmltreediff/html.py b/htmltreediff/html.py
index 19b15e4..eaab143 100644
--- a/htmltreediff/html.py
+++ b/htmltreediff/html.py
@@ -117,25 +117,41 @@ def fix_lists(dom):
             wrap_inner(c, 'del')
 
 
-def fix_tables(dom):
-    _internalize_changes_markup(dom, set(['td', 'th']))
-
-    # Show table row insertions
-    tags = set()
-    for node in list(dom.getElementsByTagName('tr')):
-        parent = node.parentNode
-        if parent.tagName in ('ins', 'del'):
-            tags.add(parent)
-    for tag in tags:
-        distribute(tag)
-    # Show table cell insertions
-    tags = set()
-    for node in list(dom.getElementsByTagName('td') + dom.getElementsByTagName('th')):
-        parent = node.parentNode
-        if parent.tagName in ('ins', 'del'):
-            tags.add(parent)
+def distribute_ins_and_del_tags(dom, tag_names):
+    """
+    Distribute each ins or del element that is the direct parent of an
+    element named in tag_names.
+
+    Every matching parent is handed to distribute() exactly once, in the
+    order it is first encountered while scanning tag_names in sequence.
+    """
+    tags = []
+    seen = set()
+    for tag_name in tag_names:
+        for node in list(dom.getElementsByTagName(tag_name)):
+            parent = node.parentNode
+            # A parent with no tagName (such as a document node) cannot be a
+            # change marker; skip it rather than raise AttributeError.
+            if getattr(parent, 'tagName', None) not in ('ins', 'del'):
+                continue
+            # The set only de-duplicates; the list keeps a stable order so
+            # distribute() runs in the same sequence on every run.
+            if parent not in seen:
+                seen.add(parent)
+                tags.append(parent)
     for tag in tags:
         distribute(tag)
+
+
+def fix_tables(dom):
+    _internalize_changes_markup(dom, set(['tbody', 'thead', 'tfoot']))
+    _internalize_changes_markup(dom, set(['tr']))
+    _internalize_changes_markup(dom, set(['td', 'th']))
+
+    distribute_ins_and_del_tags(dom, ['tbody', 'thead', 'tfoot'])
+    distribute_ins_and_del_tags(dom, ['tr'])
+    distribute_ins_and_del_tags(dom, ['td', 'th'])
+
     # All other ins and del tags inside a table but not in a cell are invalid,
     # so remove them.
     for node in list(dom.getElementsByTagName('ins') + dom.getElementsByTagName('del')):
diff --git a/htmltreediff/test_html.py b/htmltreediff/test_html.py
index f57c9c5..3235fb3 100644
--- a/htmltreediff/test_html.py
+++ b/htmltreediff/test_html.py
@@ -552,6 +552,156 @@ def test_fix_tables():
             '''
         ),
+        (
+            'tbody inside ins is distributed',
+            '''
+
+
+
A
+ ''', + ''' + + +
A
+ ''' + ), + ( + 'tbody inside del is distributed', + ''' + + +
A
+ ''', + ''' + + +
A
+ ''' + ), + ( + 'thead inside ins is distributed', + ''' + + + +
Header
Data
+ ''', + ''' + + + +
Header
Data
+ ''' + ), + ( + 'thead inside del is distributed', + ''' + + + +
Header
Data
+ ''', + ''' + + + +
Header
Data
+ ''' + ), + ( + 'tfoot inside ins is distributed', + ''' + + + +
Data
Footer
+ ''', + ''' + + + +
Data
Footer
+ ''' + ), + ( + 'tfoot inside del is distributed', + ''' + + + +
Data
Footer
+ ''', + ''' + + + +
Data
Footer
+ ''' + ), + ( + 'tbody del and ins pair is internalized', + ''' + + + +
old data
new data
+ ''', + ''' + + +
old datanew data
+ ''' + ), + ( + 'thead del and ins pair is internalized', + ''' + + + + +
old header
new header
data
+ ''', + ''' + + + +
old headernew header
data
+ ''' + ), + ( + 'tfoot del and ins pair is internalized', + ''' + + + + +
data
old footer
new footer
+ ''', + ''' + + + +
data
old footernew footer
+ ''' + ), + ( + 'tr del and ins pair is internalized', + ''' + + + + + +
old row
new row
+ ''', + ''' + + + + +
old rownew row
+ ''' + ), ( 'remove ins and del tags at the wrong level of the table', ''' diff --git a/setup.py b/setup.py index 6001d3b..ba9d89c 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ def get_requirements(path): setup( name="html-tree-diff", - version="0.3.0", + version="0.3.1", description="Structure-aware diff for html and xml documents", author="Christian Oudard", author_email="christian.oudard@gmail.com",