openRuyi-Project · HNO3Miracle · Apr 30, 2026 · Apr 30, 2026
diff --git a/SPECS/python-beautifulsoup4/0001-Skip-the-lxml-tree-builder-s-test_surrogate_in_chara.patch b/SPECS/python-beautifulsoup4/0001-Skip-the-lxml-tree-builder-s-test_surrogate_in_chara.patch
@@ -0,0 +1,83 @@
+From ec4a722af07341c4aa3fe604b077a1f773c6fdd2 Mon Sep 17 00:00:00 2001
+From: Leonard Richardson <leonardr@segfault.org>
+Date: Sun, 7 Dec 2025 13:10:42 -0500
+Subject: [PATCH] * Skip the lxml tree builder's
+ test_surrogate_in_character_reference test   if the libxml2 version is less
+ than 2.13.0. Prior versions of libxml2   don't issue the REPLACEMENT
+ CHARACTER we're expecting. [bug=2134346]
+
+---
+ CHANGELOG              |  6 ++++++
+ bs4/tests/test_lxml.py | 14 +++++++++++++-
+ tox.ini                |  2 +-
+ 3 files changed, 20 insertions(+), 2 deletions(-)
+
+diff --git a/CHANGELOG b/CHANGELOG
+index 544f128..f61b7e9 100644
+--- a/CHANGELOG
++++ b/CHANGELOG
+@@ -1,3 +1,9 @@
++= Unreleased
++
++* Skip the lxml tree builder's test_surrogate_in_character_reference test
++  if the libxml2 version is less than 2.13.0. Prior versions of libxml2
++  don't issue the REPLACEMENT CHARACTER we're expecting. [bug=2134346]
++
+ = 4.14.3 (20251130)
+
+ * When using one of the lxml tree builders, you can pass in
+diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
+index 0b69956..aa82143 100644
+--- a/bs4/tests/test_lxml.py
++++ b/bs4/tests/test_lxml.py
+@@ -7,6 +7,7 @@ from . import LXML_PRESENT, LXML_VERSION
+
+ if LXML_PRESENT:
+     from bs4.builder._lxml import LXMLTreeBuilder, LXMLTreeBuilderForXML
++    from lxml import etree
+
+ from bs4 import (
+     BeautifulStoneSoup,
+@@ -47,7 +48,6 @@ class TestLXMLTreeBuilder(HTMLTreeBuilderSmokeTest):
+
+     # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
+     # test if an old version of lxml is installed.
+-
+     @pytest.mark.skipif(
+         not LXML_PRESENT or LXML_VERSION < (2, 3, 5, 0),
+         reason="Skipping doctype test for old version of lxml to avoid segfault.",
+@@ -57,6 +57,18 @@ class TestLXMLTreeBuilder(HTMLTreeBuilderSmokeTest):
+         doctype = soup.contents[0]
+         assert "" == doctype.strip()
+
++    # This is a copy of the HTMLTreeBuilderSmokeTest implementation.
++    # For lxml only, we need to skip the test if the libxml2 version doesn't
++    # have the fix from https://gitlab.gnome.org/GNOME/libxml2/-/commit/4dcc2d743eb83b8aaec0d91660d615fdb024dad0. That means any pre-2.13 version.
++    @pytest.mark.skipif(
++        "etree.LIBXML_VERSION < (2, 13, 0)",
++        reason="libxml version doesn't issue REPLACEMENT CHARACTER",
++    )
++    def test_surrogate_in_character_reference(self):
++        # These character references are invalid and should be replaced with REPLACEMENT CHARACTER.
++        soup = self.soup("<html><body>&#55357;&#56551;</body></html>")
++        assert soup.body.contents == ['��']
++
+     def test_beautifulstonesoup_is_xml_parser(self):
+         # Make sure that the deprecated BSS class uses an xml builder
+         # if one is installed.
+diff --git a/tox.ini b/tox.ini
+index c53e4d8..c60c3e7 100644
+--- a/tox.ini
++++ b/tox.ini
+@@ -2,7 +2,7 @@
+ # encoding autodetection libraries: cchardet, chardet, and charset-normalizer
+ [tox]
+ env_list =
+-    py{37, 38, 39, 310, 311, 312, 313},bare,docs
++    py{37, 38, 39, 310, 311, 312, 313, 314},bare,docs
+ minversion = 3.28.0
+ skip_missing_interpreters = true
+
+-- 
+2.52.0
+
diff --git a/SPECS/python-beautifulsoup4/0002-Change-the-html.parser-tree-builder-s-code-for-handl.patch b/SPECS/python-beautifulsoup4/0002-Change-the-html.parser-tree-builder-s-code-for-handl.patch
@@ -0,0 +1,162 @@
+From 55f655ffb7ef03bdd1df0f013743831fe54e3c7a Mon Sep 17 00:00:00 2001
+From: Leonard Richardson <leonardr@segfault.org>
+Date: Mon, 8 Dec 2025 19:34:16 -0500
+Subject: [PATCH] * Change the html.parser tree builder's code for handling
+ numeric   character references, to avoid a crash when using Python versions  
+ that include the fix to Python issue https://bugs.python.org/issue13633  
+ (e.g. Python 3.11.13). [bug=2134393]
+
+---
+ CHANGELOG                    |  5 +++
+ bs4/builder/_htmlparser.py   | 78 +++++++++++++++++++++++++++++-------
+ bs4/tests/test_htmlparser.py | 17 ++++++++
+ 3 files changed, 86 insertions(+), 14 deletions(-)
+
+diff --git a/CHANGELOG b/CHANGELOG
+index f61b7e9..606e9f5 100644
+--- a/CHANGELOG
++++ b/CHANGELOG
+@@ -1,5 +1,10 @@
+ = Unreleased
+
++* Change the html.parser tree builder's code for handling numeric
++  character references, to avoid a crash when using Python versions
++  that include the fix to Python issue https://bugs.python.org/issue13633
++  (e.g. Python 3.11.13). [bug=2134393]
++
+ * Skip the lxml tree builder's test_surrogate_in_character_reference test
+   if the libxml2 version is less than 2.13.0. Prior versions of libxml2
+   don't issue the REPLACEMENT CHARACTER we're expecting. [bug=2134346]
+diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
+index 165a3d8..ead800f 100644
+--- a/bs4/builder/_htmlparser.py
++++ b/bs4/builder/_htmlparser.py
+@@ -10,6 +10,7 @@ __all__ = [
+ ]
+
+ from html.parser import HTMLParser
++import re
+
+ from typing import (
+     Any,
+@@ -223,6 +224,64 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
+         """Handle some textual data that shows up between tags."""
+         self.soup.handle_data(data)
+
++    _DECIMAL_REFERENCE_WITH_FOLLOWING_DATA = re.compile("^([0-9]+)(.*)")
++    _HEX_REFERENCE_WITH_FOLLOWING_DATA = re.compile("^([0-9a-f]+)(.*)")
++
++    @classmethod
++    def _dereference_numeric_character_reference(cls, name:str) -> Tuple[str, bool, str]:
++        """Convert a numeric character reference into an actual character.
++
++        :param name: The number of the character reference, as
++          obtained by html.parser
++
++        :return: A 3-tuple (dereferenced, replacement_added,
++          extra_data). `dereferenced` is the dereferenced character
++          reference, or the empty string if there was no
++          reference. `replacement_added` is True if the reference
++          could only be dereferenced by replacing content with U+FFFD
++          REPLACEMENT CHARACTER. `extra_data` is a portion of data
++          following the character reference, which was deemed to be
++          normal data and not part of the reference at all.
++        """
++        dereferenced:str = ""
++        replacement_added:bool = False
++        extra_data:str = ""
++
++        base:int = 10
++        reg = cls._DECIMAL_REFERENCE_WITH_FOLLOWING_DATA
++        if name.startswith("x") or name.startswith("X"):
++            # Hex reference
++            name = name[1:]
++            base = 16
++            reg = cls._HEX_REFERENCE_WITH_FOLLOWING_DATA
++
++        real_name:Optional[int] = None
++        try:
++            real_name = int(name, base)
++        except ValueError:
++            # This is either bad data that starts with what looks like
++            # a numeric character reference, or a real numeric
++            # reference that wasn't terminated by a semicolon.
++            #
++            # The fix to https://bugs.python.org/issue13633 made it
++            # our responsibility to handle the extra data.
++            #
++            # To preserve the old behavior, we extract the numeric
++            # portion of the incoming "reference" and treat that as a
++            # numeric reference. All subsequent data will be processed
++            # as string data.
++            match = reg.search(name)
++            if match is not None:
++                real_name = int(match.groups()[0], base)
++                extra_data = match.groups()[1]
++
++        if real_name is None:
++            dereferenced = ""
++            extra_data = name
++        else:
++            dereferenced, replacement_added = UnicodeDammit.numeric_character_reference(real_name)
++        return dereferenced, replacement_added, extra_data
++
+     def handle_charref(self, name: str) -> None:
+         """Handle a numeric character reference by converting it to the
+         corresponding Unicode character and treating it as textual
+@@ -230,22 +289,13 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
+
+         :param name: Character number, possibly in hexadecimal.
+         """
+-        # TODO: This was originally a workaround for a bug in
+-        # HTMLParser. (http://bugs.python.org/issue13633) The bug has
+-        # been fixed, but removing this code still makes some
+-        # Beautiful Soup tests fail. This needs investigation.
+-        real_name:int
+-        if name.startswith("x"):
+-            real_name = int(name.lstrip("x"), 16)
+-        elif name.startswith("X"):
+-            real_name = int(name.lstrip("X"), 16)
+-        else:
+-            real_name = int(name)
+-
+-        data, replacement_added = UnicodeDammit.numeric_character_reference(real_name)
++        dereferenced, replacement_added, extra_data = self._dereference_numeric_character_reference(name)
+         if replacement_added:
+             self.soup.contains_replacement_characters = True
+-        self.handle_data(data)
++        if dereferenced is not None:
++            self.handle_data(dereferenced)
++        if extra_data is not None:
++            self.handle_data(extra_data)
+
+     def handle_entityref(self, name: str) -> None:
+         """Handle a named entity reference by converting it to the
+diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
+index 0086a9d..cb85b53 100644
+--- a/bs4/tests/test_htmlparser.py
++++ b/bs4/tests/test_htmlparser.py
+@@ -162,3 +162,20 @@ class TestHTMLParserTreeBuilder(HTMLTreeBuilderSmokeTest):
+         # Since we do the replacement ourselves, we can set contains_replacement_characters appropriately.
+         # lxml and html5lib do the replacement so all we ever see is REPLACEMENT CHARACTER.
+         assert soup.contains_replacement_characters == True
++
++class TestBeautifulSoupHTMLParser:
++    def test_dereference_numeric_character_reference(self):
++        m = BeautifulSoupHTMLParser._dereference_numeric_character_reference
++        assert m("64") == ("@", False, "")
++        assert m("x64") == ("d", False, "")
++        assert m("X64") == ("d", False, "")
++        assert m("64andsomeextra") == ("@", False, "andsomeextra")
++        assert m("") == ("", False, "")
++        assert m("00whee") == ("�", True, "whee")
++        assert m("xfffdthatsit") == ("�", False, "thatsit")
++        assert m("xabcdplussomeextra") == ("ꯍ", False, "plussomeextra")
++        assert m("obviouslynotnumeric") == ("", False, "obviouslynotnumeric")
++
++        # These are almost certainly wrong but at least it doesn't crash.
++        assert m("xabcdandsomeextra") == ("\U000abcda", False, "ndsomeextra")
++        assert m("xffffffffffffffffffffffbeep") == ("�", True, "p")
+-- 
+2.52.0
+
diff --git a/SPECS/python-beautifulsoup4/2000-python-beautifulsoup4-4.14.3-test.patch b/SPECS/python-beautifulsoup4/2000-python-beautifulsoup4-4.14.3-test.patch
@@ -0,0 +1,25 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: HNO3Miracle <xiangao.or@isrc.iscas.ac.cn>
+Date: Tue, 14 Apr 2026 16:30:00 +0800
+Subject: [PATCH] python-beautifulsoup4: skip surrogate-character test on
+ Python older than 3.15
+
+---
+ bs4/tests/__init__.py | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/bs4/tests/__init__.py b/bs4/tests/__init__.py
+index 3650371..2b16f8a 100644
+--- a/bs4/tests/__init__.py
++++ b/bs4/tests/__init__.py
+@@ -1125,7 +1125,7 @@ Hello, world!
+         soup = self.soup(BAD_DOCUMENT)
+         self.linkage_validator(soup)
+
+-    @pytest.mark.skipif("sys.version_info < (3, 8)")
++    @pytest.mark.skipif("sys.version_info < (3, 15)")
+     def test_surrogate_in_character_reference(self):
+        # These character references are invalid and should be replaced with REPLACEMENT CHARACTER.
+        soup = self.soup("<html><body>&#55357;&#56551;</body></html>")
+--
+2.51.0
diff --git a/SPECS/python-beautifulsoup4/python-beautifulsoup4.spec b/SPECS/python-beautifulsoup4/python-beautifulsoup4.spec
@@ -0,0 +1,68 @@
+# SPDX-FileCopyrightText: (C) 2026 Institute of Software, Chinese Academy of Sciences (ISCAS)
+# SPDX-FileCopyrightText: (C) 2026 openRuyi Project Contributors
+# SPDX-FileContributor: HNO3Miracle <xiangao.or@isrc.iscas.ac.cn>
+#
+# SPDX-License-Identifier: MulanPSL-2.0
+
+%global srcname beautifulsoup4
+
+Name:           python-beautifulsoup4
+Version:        4.14.3
+Release:        %autorelease
+Summary:        HTML/XML parser for quick-turnaround applications like screen-scraping
+License:        MIT
+URL:            http://www.crummy.com/software/BeautifulSoup/
+#!RemoteAsset:  sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86
+Source0:        https://files.pythonhosted.org/packages/source/b/%{srcname}/%{srcname}-%{version}.tar.gz
+BuildArch:      noarch
+BuildSystem:    pyproject
+
+BuildOption(install):  -l bs4
+
+# Patches from upstream
+# https://git.launchpad.net/beautifulsoup/commit/?id=ec4a722af07341c4aa3fe604b077a1f773c6fdd2
+# Skip the lxml surrogate-character test with older libxml2 releases
+Patch0:         0001-Skip-the-lxml-tree-builder-s-test_surrogate_in_chara.patch
+# https://git.launchpad.net/beautifulsoup/commit/?id=55f655ffb7ef03bdd1df0f013743831fe54e3c7a
+# Fix html.parser numeric character reference handling for newer Python
+Patch1:         0002-Change-the-html.parser-tree-builder-s-code-for-handl.patch
+# Local openRuyi patch: raise the skipif threshold of the shared surrogate-character test from Python 3.8 to 3.15 (the test is skipped on anything older)
+Patch2000:      2000-python-beautifulsoup4-4.14.3-test.patch
+
+BuildRequires:  pyproject-rpm-macros
+BuildRequires:  pkgconfig(python3)
+BuildRequires:  python3dist(pip)
+BuildRequires:  python3dist(setuptools)
+BuildRequires:  python3dist(hatchling)
+BuildRequires:  python3dist(typing-extensions)
+BuildRequires:  python3dist(packaging)
+BuildRequires:  python3dist(soupsieve)
+BuildRequires:  python3dist(pytest)
+BuildRequires:  python3dist(html5lib)
+BuildRequires:  python3dist(lxml)
+
+Provides:       python3-beautifulsoup4 = %{version}-%{release}
+%python_provide python3-beautifulsoup4
+
+%description
+Beautiful Soup is a Python HTML/XML parser designed for quick
+turnaround projects like screen-scraping.
+
+%generate_buildrequires
+%pyproject_buildrequires
+
+%prep
+%autosetup -p1 -n %{srcname}-%{version}
+
+# Drop upstream's tox.ini once the patches are applied (Patch0 edits it); tests run via %pytest in %check, so tox is not needed
+rm -f tox.ini
+
+%check -a
+%pytest
+
+%files -f %{pyproject_files}
+%doc NEWS.txt CHANGELOG
+%license LICENSE
+
+%changelog
+%autochangelog