From f5be3d4938262ea8ef379087c66f73d90e43ca97 Mon Sep 17 00:00:00 2001 From: Stefan Brand Date: Tue, 7 Apr 2026 10:16:03 -0400 Subject: [PATCH 1/4] add tests --- htmltreediff/test_html.py | 100 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/htmltreediff/test_html.py b/htmltreediff/test_html.py index f57c9c5..bd66b46 100644 --- a/htmltreediff/test_html.py +++ b/htmltreediff/test_html.py @@ -552,6 +552,106 @@ def test_fix_tables(): ''' ), + ( + 'tbody inside ins is distributed', + ''' + + +
A
+ ''', + ''' + + +
A
+ ''' + ), + ( + 'tbody inside del is distributed', + ''' + + +
A
+ ''', + ''' + + +
A
+ ''' + ), + ( + 'thead inside ins is distributed', + ''' + + + +
Header
Data
+ ''', + ''' + + + +
Header
Data
+ ''' + ), + ( + 'thead inside del is distributed', + ''' + + + +
Header
Data
+ ''', + ''' + + + +
Header
Data
+ ''' + ), + ( + 'tfoot inside ins is distributed', + ''' + + + +
Data
Footer
+ ''', + ''' + + + +
Data
Footer
+ ''' + ), + ( + 'tfoot inside del is distributed', + ''' + + + +
Data
Footer
+ ''', + ''' + + + +
Data
Footer
+ ''' + ), + ( + 'tbody del and ins pair is distributed', + ''' + + + +
old data
new data
+ ''', + ''' + + +
old datanew data
+ ''' + ), ( 'remove ins and del tags at the wrong level of the table', '''
From db2f63e91c17740fe0628065986417827523796a Mon Sep 17 00:00:00 2001
From: Stefan Brand
Date: Tue, 7 Apr 2026 10:17:24 -0400
Subject: [PATCH 2/4] prevent changes to tbody, thead, or tfoot from causing whole table to be removed

---
 htmltreediff/html.py | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/htmltreediff/html.py b/htmltreediff/html.py
index 19b15e4..eaab143 100644
--- a/htmltreediff/html.py
+++ b/htmltreediff/html.py
@@ -117,25 +117,26 @@ def fix_lists(dom):
                 wrap_inner(c, 'del')
 
 
-def fix_tables(dom):
-    _internalize_changes_markup(dom, set(['td', 'th']))
-
-    # Show table row insertions
-    tags = set()
-    for node in list(dom.getElementsByTagName('tr')):
-        parent = node.parentNode
-        if parent.tagName in ('ins', 'del'):
-            tags.add(parent)
-    for tag in tags:
-        distribute(tag)
-    # Show table cell insertions
+def distribute_ins_and_del_tags(dom, tag_names):
     tags = set()
-    for node in list(dom.getElementsByTagName('td') + dom.getElementsByTagName('th')):
-        parent = node.parentNode
-        if parent.tagName in ('ins', 'del'):
-            tags.add(parent)
+    for tag_name in tag_names:
+        for node in list(dom.getElementsByTagName(tag_name)):
+            parent = node.parentNode
+            if parent.tagName in ('ins', 'del'):
+                tags.add(parent)
     for tag in tags:
         distribute(tag)
+
+
+def fix_tables(dom):
+    _internalize_changes_markup(dom, set(['tbody', 'thead', 'tfoot']))
+    _internalize_changes_markup(dom, set(['tr']))
+    _internalize_changes_markup(dom, set(['td', 'th']))
+
+    distribute_ins_and_del_tags(dom, ['tbody', 'thead', 'tfoot'])
+    distribute_ins_and_del_tags(dom, ['tr'])
+    distribute_ins_and_del_tags(dom, ['td', 'th'])
+
     # All other ins and del tags inside a table but not in a cell are invalid,
     # so remove them.
for node in list(dom.getElementsByTagName('ins') + dom.getElementsByTagName('del')): From 1bc3e137e9bc3f82f2546c7a3bd3e8063365a01c Mon Sep 17 00:00:00 2001 From: Stefan Brand Date: Tue, 7 Apr 2026 11:07:41 -0400 Subject: [PATCH 3/4] add tests for the internalization --- htmltreediff/test_html.py | 52 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/htmltreediff/test_html.py b/htmltreediff/test_html.py index bd66b46..3235fb3 100644 --- a/htmltreediff/test_html.py +++ b/htmltreediff/test_html.py @@ -639,7 +639,7 @@ def test_fix_tables(): ''' ), ( - 'tbody del and ins pair is distributed', + 'tbody del and ins pair is internalized', ''' @@ -652,6 +652,56 @@ def test_fix_tables():
old data
''' ), + ( + 'thead del and ins pair is internalized', + ''' + + + + +
old header
new header
data
+ ''', + ''' + + + +
old headernew header
data
+ ''' + ), + ( + 'tfoot del and ins pair is internalized', + ''' + + + + +
data
old footer
new footer
+ ''', + ''' + + + +
data
old footernew footer
+ ''' + ), + ( + 'tr del and ins pair is internalized', + ''' + + + + + +
old row
new row
+ ''', + ''' + + + + +
old rownew row
+ ''' + ), ( 'remove ins and del tags at the wrong level of the table', '''
From 7d9378d3a7d453d81cf96330a3d9f7b491a5bd54 Mon Sep 17 00:00:00 2001
From: Stefan Brand
Date: Tue, 7 Apr 2026 11:15:24 -0400
Subject: [PATCH 4/4] bump the version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 6001d3b..ba9d89c 100644
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,7 @@ def get_requirements(path):
 
 setup(
     name="html-tree-diff",
-    version="0.3.0",
+    version="0.3.1",
     description="Structure-aware diff for html and xml documents",
     author="Christian Oudard",
     author_email="christian.oudard@gmail.com",