Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
From ec4a722af07341c4aa3fe604b077a1f773c6fdd2 Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonardr@segfault.org>
Date: Sun, 7 Dec 2025 13:10:42 -0500
Subject: [PATCH] * Skip the lxml tree builder's
test_surrogate_in_character_reference test if the libxml2 version is less
than 2.13.0. Prior versions of libxml2 don't issue the REPLACEMENT
CHARACTER we're expecting. [bug=2134346]

---
CHANGELOG | 6 ++++++
bs4/tests/test_lxml.py | 14 +++++++++++++-
tox.ini | 2 +-
3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 544f128..f61b7e9 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,9 @@
+= Unreleased
+
+* Skip the lxml tree builder's test_surrogate_in_character_reference test
+ if the libxml2 version is less than 2.13.0. Prior versions of libxml2
+ don't issue the REPLACEMENT CHARACTER we're expecting. [bug=2134346]
+
= 4.14.3 (20251130)

* When using one of the lxml tree builders, you can pass in
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 0b69956..aa82143 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -7,6 +7,7 @@ from . import LXML_PRESENT, LXML_VERSION

if LXML_PRESENT:
from bs4.builder._lxml import LXMLTreeBuilder, LXMLTreeBuilderForXML
+ from lxml import etree

from bs4 import (
BeautifulStoneSoup,
@@ -47,7 +48,6 @@ class TestLXMLTreeBuilder(HTMLTreeBuilderSmokeTest):

# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
# test if an old version of lxml is installed.
-
@pytest.mark.skipif(
not LXML_PRESENT or LXML_VERSION < (2, 3, 5, 0),
reason="Skipping doctype test for old version of lxml to avoid segfault.",
@@ -57,6 +57,18 @@ class TestLXMLTreeBuilder(HTMLTreeBuilderSmokeTest):
doctype = soup.contents[0]
assert "" == doctype.strip()

+ # This is a copy of the HTMLTreeBuilderSmokeTest implementation.
+ # For lxml only, we need to skip the test if the libxml2 version doesn't
+ # have the fix from https://gitlab.gnome.org/GNOME/libxml2/-/commit/4dcc2d743eb83b8aaec0d91660d615fdb024dad0. That means any pre-2.13 version.
+ @pytest.mark.skipif(
+ "etree.LIBXML_VERSION < (2, 13, 0)",
+ reason="libxml version doesn't issue REPLACEMENT CHARACTER",
+ )
+ def test_surrogate_in_character_reference(self):
+ # These character references are invalid and should be replaced with REPLACEMENT CHARACTER.
+ soup = self.soup("<html><body>&#55357;&#56551;</body></html>")
+ assert soup.body.contents == ['��']
+
def test_beautifulstonesoup_is_xml_parser(self):
# Make sure that the deprecated BSS class uses an xml builder
# if one is installed.
diff --git a/tox.ini b/tox.ini
index c53e4d8..c60c3e7 100644
--- a/tox.ini
+++ b/tox.ini
@@ -2,7 +2,7 @@
# encoding autodetection libraries: cchardet, chardet, and charset-normalizer
[tox]
env_list =
- py{37, 38, 39, 310, 311, 312, 313},bare,docs
+ py{37, 38, 39, 310, 311, 312, 313, 314},bare,docs
minversion = 3.28.0
skip_missing_interpreters = true

--
2.52.0

Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
From 55f655ffb7ef03bdd1df0f013743831fe54e3c7a Mon Sep 17 00:00:00 2001
From: Leonard Richardson <leonardr@segfault.org>
Date: Mon, 8 Dec 2025 19:34:16 -0500
Subject: [PATCH] * Change the html.parser tree builder's code for handling
numeric character references, to avoid a crash when using Python versions
that include the fix to Python issue https://bugs.python.org/issue13633
(e.g. Python 3.11.13). [bug=2134393]

---
CHANGELOG | 5 +++
bs4/builder/_htmlparser.py | 78 +++++++++++++++++++++++++++++-------
bs4/tests/test_htmlparser.py | 17 ++++++++
3 files changed, 86 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index f61b7e9..606e9f5 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,10 @@
= Unreleased

+* Change the html.parser tree builder's code for handling numeric
+ character references, to avoid a crash when using Python versions
+ that include the fix to Python issue https://bugs.python.org/issue13633
+ (e.g. Python 3.11.13). [bug=2134393]
+
* Skip the lxml tree builder's test_surrogate_in_character_reference test
if the libxml2 version is less than 2.13.0. Prior versions of libxml2
don't issue the REPLACEMENT CHARACTER we're expecting. [bug=2134346]
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 165a3d8..ead800f 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -10,6 +10,7 @@ __all__ = [
]

from html.parser import HTMLParser
+import re

from typing import (
Any,
@@ -223,6 +224,64 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
"""Handle some textual data that shows up between tags."""
self.soup.handle_data(data)

+ _DECIMAL_REFERENCE_WITH_FOLLOWING_DATA = re.compile("^([0-9]+)(.*)")
+ _HEX_REFERENCE_WITH_FOLLOWING_DATA = re.compile("^([0-9a-f]+)(.*)")
+
+ @classmethod
+ def _dereference_numeric_character_reference(cls, name:str) -> Tuple[str, bool, str]:
+ """Convert a numeric character reference into an actual character.
+
+ :param name: The number of the character reference, as
+ obtained by html.parser
+
+ :return: A 3-tuple (dereferenced, replacement_added,
+ extra_data). `dereferenced` is the dereferenced character
+ reference, or the empty string if there was no
+ reference. `replacement_added` is True if the reference
+ could only be dereferenced by replacing content with U+FFFD
+ REPLACEMENT CHARACTER. `extra_data` is a portion of data
+ following the character reference, which was deemed to be
+ normal data and not part of the reference at all.
+ """
+ dereferenced:str = ""
+ replacement_added:bool = False
+ extra_data:str = ""
+
+ base:int = 10
+ reg = cls._DECIMAL_REFERENCE_WITH_FOLLOWING_DATA
+ if name.startswith("x") or name.startswith("X"):
+ # Hex reference
+ name = name[1:]
+ base = 16
+ reg = cls._HEX_REFERENCE_WITH_FOLLOWING_DATA
+
+ real_name:Optional[int] = None
+ try:
+ real_name = int(name, base)
+ except ValueError:
+ # This is either bad data that starts with what looks like
+ # a numeric character reference, or a real numeric
+ # reference that wasn't terminated by a semicolon.
+ #
+ # The fix to https://bugs.python.org/issue13633 made it
+ # our responsibility to handle the extra data.
+ #
+ # To preserve the old behavior, we extract the numeric
+ # portion of the incoming "reference" and treat that as a
+ # numeric reference. All subsequent data will be processed
+ # as string data.
+ match = reg.search(name)
+ if match is not None:
+ real_name = int(match.groups()[0], base)
+ extra_data = match.groups()[1]
+
+ if real_name is None:
+ dereferenced = ""
+ extra_data = name
+ else:
+ dereferenced, replacement_added = UnicodeDammit.numeric_character_reference(real_name)
+ return dereferenced, replacement_added, extra_data
+
def handle_charref(self, name: str) -> None:
"""Handle a numeric character reference by converting it to the
corresponding Unicode character and treating it as textual
@@ -230,22 +289,13 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):

:param name: Character number, possibly in hexadecimal.
"""
- # TODO: This was originally a workaround for a bug in
- # HTMLParser. (http://bugs.python.org/issue13633) The bug has
- # been fixed, but removing this code still makes some
- # Beautiful Soup tests fail. This needs investigation.
- real_name:int
- if name.startswith("x"):
- real_name = int(name.lstrip("x"), 16)
- elif name.startswith("X"):
- real_name = int(name.lstrip("X"), 16)
- else:
- real_name = int(name)
-
- data, replacement_added = UnicodeDammit.numeric_character_reference(real_name)
+ dereferenced, replacement_added, extra_data = self._dereference_numeric_character_reference(name)
if replacement_added:
self.soup.contains_replacement_characters = True
- self.handle_data(data)
+ if dereferenced is not None:
+ self.handle_data(dereferenced)
+ if extra_data is not None:
+ self.handle_data(extra_data)

def handle_entityref(self, name: str) -> None:
"""Handle a named entity reference by converting it to the
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 0086a9d..cb85b53 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -162,3 +162,20 @@ class TestHTMLParserTreeBuilder(HTMLTreeBuilderSmokeTest):
# Since we do the replacement ourselves, we can set contains_replacement_characters appropriately.
# lxml and html5lib do the replacement so all we ever see is REPLACEMENT CHARACTER.
assert soup.contains_replacement_characters == True
+
+class TestBeautifulSoupHTMLParser:
+ def test_dereference_numeric_character_reference(self):
+ m = BeautifulSoupHTMLParser._dereference_numeric_character_reference
+ assert m("64") == ("@", False, "")
+ assert m("x64") == ("d", False, "")
+ assert m("X64") == ("d", False, "")
+ assert m("64andsomeextra") == ("@", False, "andsomeextra")
+ assert m("") == ("", False, "")
+ assert m("00whee") == ("�", True, "whee")
+ assert m("xfffdthatsit") == ("�", False, "thatsit")
+ assert m("xabcdplussomeextra") == ("ꯍ", False, "plussomeextra")
+ assert m("obviouslynotnumeric") == ("", False, "obviouslynotnumeric")
+
+ # These are almost certainly wrong but at least it doesn't crash.
+ assert m("xabcdandsomeextra") == ("\U000abcda", False, "ndsomeextra")
+ assert m("xffffffffffffffffffffffbeep") == ("�", True, "p")
--
2.52.0

Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: HNO3Miracle <xiangao.or@isrc.iscas.ac.cn>
Date: Tue, 14 Apr 2026 16:30:00 +0800
Subject: [PATCH] python-beautifulsoup4: adjust surrogate-character test for
Python 3.15

---
bs4/tests/__init__.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bs4/tests/__init__.py b/bs4/tests/__init__.py
index 3650371..2b16f8a 100644
--- a/bs4/tests/__init__.py
+++ b/bs4/tests/__init__.py
@@ -1125,7 +1125,7 @@ Hello, world!
soup = self.soup(BAD_DOCUMENT)
self.linkage_validator(soup)

- @pytest.mark.skipif("sys.version_info < (3, 8)")
+ @pytest.mark.skipif("sys.version_info < (3, 15)")
def test_surrogate_in_character_reference(self):
# These character references are invalid and should be replaced with REPLACEMENT CHARACTER.
soup = self.soup("<html><body>&#55357;&#56551;</body></html>")
--
2.51.0
68 changes: 68 additions & 0 deletions SPECS/python-beautifulsoup4/python-beautifulsoup4.spec
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# SPDX-FileCopyrightText: (C) 2026 Institute of Software, Chinese Academy of Sciences (ISCAS)
# SPDX-FileCopyrightText: (C) 2026 openRuyi Project Contributors
# SPDX-FileContributor: HNO3Miracle <xiangao.or@isrc.iscas.ac.cn>
#
# SPDX-License-Identifier: MulanPSL-2.0

%global srcname beautifulsoup4

Name: python-beautifulsoup4
Version: 4.14.3
Release: %autorelease
Summary: HTML/XML parser for quick-turnaround applications like screen-scraping
License: MIT
URL: http://www.crummy.com/software/BeautifulSoup/
#!RemoteAsset: sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86
Source0: https://files.pythonhosted.org/packages/source/b/%{srcname}/%{srcname}-%{version}.tar.gz
BuildArch: noarch
BuildSystem: pyproject

BuildOption(install): -l bs4

# Patches from upstream
# https://git.launchpad.net/beautifulsoup/commit/?id=ec4a722af07341c4aa3fe604b077a1f773c6fdd2
# Skip the lxml surrogate-character test with older libxml2 releases
Patch0: 0001-Skip-the-lxml-tree-builder-s-test_surrogate_in_chara.patch
# https://git.launchpad.net/beautifulsoup/commit/?id=55f655ffb7ef03bdd1df0f013743831fe54e3c7a
# Fix html.parser numeric character reference handling for newer Python
Patch1: 0002-Change-the-html.parser-tree-builder-s-code-for-handl.patch
# Local openRuyi patch: raise the skipif threshold of the shared surrogate-character smoke test from Python 3.8 to 3.15, so the test only runs on Python >= 3.15
Patch2000: 2000-python-beautifulsoup4-4.14.3-test.patch

BuildRequires: pyproject-rpm-macros
BuildRequires: pkgconfig(python3)
BuildRequires: python3dist(pip)
BuildRequires: python3dist(setuptools)
BuildRequires: python3dist(hatchling)
BuildRequires: python3dist(typing-extensions)
BuildRequires: python3dist(packaging)
BuildRequires: python3dist(soupsieve)
BuildRequires: python3dist(pytest)
BuildRequires: python3dist(html5lib)
BuildRequires: python3dist(lxml)

Provides: python3-beautifulsoup4 = %{version}-%{release}
%python_provide python3-beautifulsoup4

%description
Beautiful Soup is a Python HTML/XML parser designed for quick
turnaround projects like screen-scraping.

%generate_buildrequires
%pyproject_buildrequires

%prep
%autosetup -p1 -n %{srcname}-%{version}

# Remove tox.ini so that no tox configuration is left in the build tree; the tests are run directly with pytest in the check section
rm -f tox.ini

%check -a
%pytest

%files -f %{pyproject_files}
%doc NEWS.txt CHANGELOG
%license LICENSE

%changelog
%autochangelog
Loading
Loading