From dbb21ada0105d3d50472340a04c14b8b8749f837 Mon Sep 17 00:00:00 2001 From: HNO3Miracle Date: Thu, 30 Apr 2026 20:24:21 +0800 Subject: [PATCH 1/2] SPECS: Add python-soupsieve. --- SPECS/python-soupsieve/python-soupsieve.spec | 58 ++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 SPECS/python-soupsieve/python-soupsieve.spec diff --git a/SPECS/python-soupsieve/python-soupsieve.spec b/SPECS/python-soupsieve/python-soupsieve.spec new file mode 100644 index 000000000..331486301 --- /dev/null +++ b/SPECS/python-soupsieve/python-soupsieve.spec @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: (C) 2026 Institute of Software, Chinese Academy of Sciences (ISCAS) +# SPDX-FileCopyrightText: (C) 2026 openRuyi Project Contributors +# SPDX-FileContributor: HNO3Miracle +# +# SPDX-License-Identifier: MulanPSL-2.0 + +%bcond tests 0 + +%global srcname soupsieve + +Name: python-soupsieve +Version: 2.8.3 +Release: %autorelease +Summary: A modern CSS selector implementation for Beautiful Soup +License: MIT +URL: https://github.com/facelessuser/soupsieve +#!RemoteAsset: sha256:3267f1eeea4251fb42728b6dfb746edc9acaffc4a45b27e19450b676586e8349 +Source0: https://files.pythonhosted.org/packages/source/s/%{srcname}/%{srcname}-%{version}.tar.gz +BuildArch: noarch +BuildSystem: pyproject + +BuildOption(install): -l soupsieve + +BuildRequires: pyproject-rpm-macros +BuildRequires: pkgconfig(python3) +BuildRequires: python3dist(setuptools) +%if %{with tests} +BuildRequires: python3dist(pytest) +%endif + +Provides: python3-soupsieve = %{version}-%{release} +%python_provide python3-soupsieve + +%description +Soup Sieve is a CSS selector library designed to be used with Beautiful Soup 4. +It aims to provide selecting, matching, and filtering using modern CSS +selectors. Soup Sieve currently provides selectors from the CSS level 1 +specifications up through the latest CSS level 4 drafts and beyond (though some +are not yet implemented). 
+ +Soup Sieve was written with the intent to replace Beautiful Soup's builtin +select feature, and as of Beautiful Soup version 4.7.0, it now is. Soup Sieve +can also be imported in order to use its API directly for more controlled, +specialized parsing. + +%generate_buildrequires +%pyproject_buildrequires %{?with_tests:-t} + +%check +# Intentionally empty: the default import check pulls in modules that require +# bs4, which would create a build dependency cycle with python-beautifulsoup4. + +%files -f %{pyproject_files} +%doc README.md +%license LICENSE.md + +%changelog +%autochangelog From 101727394160caeab69b7776a5d48c427c7fb97c Mon Sep 17 00:00:00 2001 From: HNO3Miracle Date: Thu, 30 Apr 2026 20:24:22 +0800 Subject: [PATCH 2/2] SPECS: Add python-beautifulsoup4. --- ...ee-builder-s-test_surrogate_in_chara.patch | 83 +++++++++ ...parser-tree-builder-s-code-for-handl.patch | 162 ++++++++++++++++++ ...00-python-beautifulsoup4-4.14.3-test.patch | 25 +++ .../python-beautifulsoup4.spec | 68 ++++++++ SPECS/python-soupsieve/python-soupsieve.spec | 2 + 5 files changed, 340 insertions(+) create mode 100644 SPECS/python-beautifulsoup4/0001-Skip-the-lxml-tree-builder-s-test_surrogate_in_chara.patch create mode 100644 SPECS/python-beautifulsoup4/0002-Change-the-html.parser-tree-builder-s-code-for-handl.patch create mode 100644 SPECS/python-beautifulsoup4/2000-python-beautifulsoup4-4.14.3-test.patch create mode 100644 SPECS/python-beautifulsoup4/python-beautifulsoup4.spec diff --git a/SPECS/python-beautifulsoup4/0001-Skip-the-lxml-tree-builder-s-test_surrogate_in_chara.patch b/SPECS/python-beautifulsoup4/0001-Skip-the-lxml-tree-builder-s-test_surrogate_in_chara.patch new file mode 100644 index 000000000..866492fb4 --- /dev/null +++ b/SPECS/python-beautifulsoup4/0001-Skip-the-lxml-tree-builder-s-test_surrogate_in_chara.patch @@ -0,0 +1,83 @@ +From ec4a722af07341c4aa3fe604b077a1f773c6fdd2 Mon Sep 17 00:00:00 2001 +From: Leonard Richardson +Date: Sun, 7 Dec 2025 13:10:42 -0500 +Subject: [PATCH] * 
Skip the lxml tree builder's + test_surrogate_in_character_reference test if the libxml2 version is less + than 2.13.0. Prior versions of libxml2 don't issue the REPLACEMENT + CHARACTER we're expecting. [bug=2134346] + +--- + CHANGELOG | 6 ++++++ + bs4/tests/test_lxml.py | 14 +++++++++++++- + tox.ini | 2 +- + 3 files changed, 20 insertions(+), 2 deletions(-) + +diff --git a/CHANGELOG b/CHANGELOG +index 544f128..f61b7e9 100644 +--- a/CHANGELOG ++++ b/CHANGELOG +@@ -1,3 +1,9 @@ ++= Unreleased ++ ++* Skip the lxml tree builder's test_surrogate_in_character_reference test ++ if the libxml2 version is less than 2.13.0. Prior versions of libxml2 ++ don't issue the REPLACEMENT CHARACTER we're expecting. [bug=2134346] ++ + = 4.14.3 (20251130) + + * When using one of the lxml tree builders, you can pass in +diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py +index 0b69956..aa82143 100644 +--- a/bs4/tests/test_lxml.py ++++ b/bs4/tests/test_lxml.py +@@ -7,6 +7,7 @@ from . import LXML_PRESENT, LXML_VERSION + + if LXML_PRESENT: + from bs4.builder._lxml import LXMLTreeBuilder, LXMLTreeBuilderForXML ++ from lxml import etree + + from bs4 import ( + BeautifulStoneSoup, +@@ -47,7 +48,6 @@ class TestLXMLTreeBuilder(HTMLTreeBuilderSmokeTest): + + # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this + # test if an old version of lxml is installed. +- + @pytest.mark.skipif( + not LXML_PRESENT or LXML_VERSION < (2, 3, 5, 0), + reason="Skipping doctype test for old version of lxml to avoid segfault.", +@@ -57,6 +57,18 @@ class TestLXMLTreeBuilder(HTMLTreeBuilderSmokeTest): + doctype = soup.contents[0] + assert "" == doctype.strip() + ++ # This is a copy of the HTMLTreeBuilderSmokeTest implementation. ++ # For lxml only, we need to skip the test if the libxml2 version doesn't ++ # have the fix from https://gitlab.gnome.org/GNOME/libxml2/-/commit/4dcc2d743eb83b8aaec0d91660d615fdb024dad0. That means any pre-2.13 version. 
++ @pytest.mark.skipif( ++ "etree.LIBXML_VERSION < (2, 13, 0)", ++ reason="libxml version doesn't issue REPLACEMENT CHARACTER", ++ ) ++ def test_surrogate_in_character_reference(self): ++ # These character references are invalid and should be replaced with REPLACEMENT CHARACTER. ++ soup = self.soup("��") ++ assert soup.body.contents == ['��'] ++ + def test_beautifulstonesoup_is_xml_parser(self): + # Make sure that the deprecated BSS class uses an xml builder + # if one is installed. +diff --git a/tox.ini b/tox.ini +index c53e4d8..c60c3e7 100644 +--- a/tox.ini ++++ b/tox.ini +@@ -2,7 +2,7 @@ + # encoding autodetection libraries: cchardet, chardet, and charset-normalizer + [tox] + env_list = +- py{37, 38, 39, 310, 311, 312, 313},bare,docs ++ py{37, 38, 39, 310, 311, 312, 313, 314},bare,docs + minversion = 3.28.0 + skip_missing_interpreters = true + +-- +2.52.0 + diff --git a/SPECS/python-beautifulsoup4/0002-Change-the-html.parser-tree-builder-s-code-for-handl.patch b/SPECS/python-beautifulsoup4/0002-Change-the-html.parser-tree-builder-s-code-for-handl.patch new file mode 100644 index 000000000..aeae1ff9f --- /dev/null +++ b/SPECS/python-beautifulsoup4/0002-Change-the-html.parser-tree-builder-s-code-for-handl.patch @@ -0,0 +1,162 @@ +From 55f655ffb7ef03bdd1df0f013743831fe54e3c7a Mon Sep 17 00:00:00 2001 +From: Leonard Richardson +Date: Mon, 8 Dec 2025 19:34:16 -0500 +Subject: [PATCH] * Change the html.parser tree builder's code for handling + numeric character references, to avoid a crash when using Python versions + that include the fix to Python issue https://bugs.python.org/issue13633 + (e.g. Python 3.11.13). 
[bug=2134393] + +--- + CHANGELOG | 5 +++ + bs4/builder/_htmlparser.py | 78 +++++++++++++++++++++++++++++------- + bs4/tests/test_htmlparser.py | 17 ++++++++ + 3 files changed, 86 insertions(+), 14 deletions(-) + +diff --git a/CHANGELOG b/CHANGELOG +index f61b7e9..606e9f5 100644 +--- a/CHANGELOG ++++ b/CHANGELOG +@@ -1,5 +1,10 @@ + = Unreleased + ++* Change the html.parser tree builder's code for handling numeric ++ character references, to avoid a crash when using Python versions ++ that include the fix to Python issue https://bugs.python.org/issue13633 ++ (e.g. Python 3.11.13). [bug=2134393] ++ + * Skip the lxml tree builder's test_surrogate_in_character_reference test + if the libxml2 version is less than 2.13.0. Prior versions of libxml2 + don't issue the REPLACEMENT CHARACTER we're expecting. [bug=2134346] +diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py +index 165a3d8..ead800f 100644 +--- a/bs4/builder/_htmlparser.py ++++ b/bs4/builder/_htmlparser.py +@@ -10,6 +10,7 @@ __all__ = [ + ] + + from html.parser import HTMLParser ++import re + + from typing import ( + Any, +@@ -223,6 +224,64 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML): + """Handle some textual data that shows up between tags.""" + self.soup.handle_data(data) + ++ _DECIMAL_REFERENCE_WITH_FOLLOWING_DATA = re.compile("^([0-9]+)(.*)") ++ _HEX_REFERENCE_WITH_FOLLOWING_DATA = re.compile("^([0-9a-f]+)(.*)") ++ ++ @classmethod ++ def _dereference_numeric_character_reference(cls, name:str) -> Tuple[str, bool, str]: ++ """Convert a numeric character reference into an actual character. ++ ++ :param name: The number of the character reference, as ++ obtained by html.parser ++ ++ :return: A 3-tuple (dereferenced, replacement_added, ++ extra_data). `dereferenced` is the dereferenced character ++ reference, or the empty string if there was no ++ reference. 
`replacement_added` is True if the reference ++ could only be dereferenced by replacing content with U+FFFD ++ REPLACEMENT CHARACTER. `extra_data` is a portion of data ++ following the character reference, which was deemed to be ++ normal data and not part of the reference at all. ++ """ ++ dereferenced:str = "" ++ replacement_added:bool = False ++ extra_data:str = "" ++ ++ base:int = 10 ++ reg = cls._DECIMAL_REFERENCE_WITH_FOLLOWING_DATA ++ if name.startswith("x") or name.startswith("X"): ++ # Hex reference ++ name = name[1:] ++ base = 16 ++ reg = cls._HEX_REFERENCE_WITH_FOLLOWING_DATA ++ ++ real_name:Optional[int] = None ++ try: ++ real_name = int(name, base) ++ except ValueError: ++ # This is either bad data that starts with what looks like ++ # a numeric character reference, or a real numeric ++ # reference that wasn't terminated by a semicolon. ++ # ++ # The fix to https://bugs.python.org/issue13633 made it ++ # our responsibility to handle the extra data. ++ # ++ # To preserve the old behavior, we extract the numeric ++ # portion of the incoming "reference" and treat that as a ++ # numeric reference. All subsequent data will be processed ++ # as string data. ++ match = reg.search(name) ++ if match is not None: ++ real_name = int(match.groups()[0], base) ++ extra_data = match.groups()[1] ++ ++ if real_name is None: ++ dereferenced = "" ++ extra_data = name ++ else: ++ dereferenced, replacement_added = UnicodeDammit.numeric_character_reference(real_name) ++ return dereferenced, replacement_added, extra_data ++ + def handle_charref(self, name: str) -> None: + """Handle a numeric character reference by converting it to the + corresponding Unicode character and treating it as textual +@@ -230,22 +289,13 @@ class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML): + + :param name: Character number, possibly in hexadecimal. + """ +- # TODO: This was originally a workaround for a bug in +- # HTMLParser. 
(http://bugs.python.org/issue13633) The bug has +- # been fixed, but removing this code still makes some +- # Beautiful Soup tests fail. This needs investigation. +- real_name:int +- if name.startswith("x"): +- real_name = int(name.lstrip("x"), 16) +- elif name.startswith("X"): +- real_name = int(name.lstrip("X"), 16) +- else: +- real_name = int(name) +- +- data, replacement_added = UnicodeDammit.numeric_character_reference(real_name) ++ dereferenced, replacement_added, extra_data = self._dereference_numeric_character_reference(name) + if replacement_added: + self.soup.contains_replacement_characters = True +- self.handle_data(data) ++ if dereferenced is not None: ++ self.handle_data(dereferenced) ++ if extra_data is not None: ++ self.handle_data(extra_data) + + def handle_entityref(self, name: str) -> None: + """Handle a named entity reference by converting it to the +diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py +index 0086a9d..cb85b53 100644 +--- a/bs4/tests/test_htmlparser.py ++++ b/bs4/tests/test_htmlparser.py +@@ -162,3 +162,20 @@ class TestHTMLParserTreeBuilder(HTMLTreeBuilderSmokeTest): + # Since we do the replacement ourselves, we can set contains_replacement_characters appropriately. + # lxml and html5lib do the replacement so all we ever see is REPLACEMENT CHARACTER. 
+ assert soup.contains_replacement_characters == True ++ ++class TestBeautifulSoupHTMLParser: ++ def test_dereference_numeric_character_reference(self): ++ m = BeautifulSoupHTMLParser._dereference_numeric_character_reference ++ assert m("64") == ("@", False, "") ++ assert m("x64") == ("d", False, "") ++ assert m("X64") == ("d", False, "") ++ assert m("64andsomeextra") == ("@", False, "andsomeextra") ++ assert m("") == ("", False, "") ++ assert m("00whee") == ("�", True, "whee") ++ assert m("xfffdthatsit") == ("�", False, "thatsit") ++ assert m("xabcdplussomeextra") == ("ꯍ", False, "plussomeextra") ++ assert m("obviouslynotnumeric") == ("", False, "obviouslynotnumeric") ++ ++ # These are almost certainly wrong but at least it doesn't crash. ++ assert m("xabcdandsomeextra") == ("\U000abcda", False, "ndsomeextra") ++ assert m("xffffffffffffffffffffffbeep") == ("�", True, "p") +-- +2.52.0 + diff --git a/SPECS/python-beautifulsoup4/2000-python-beautifulsoup4-4.14.3-test.patch b/SPECS/python-beautifulsoup4/2000-python-beautifulsoup4-4.14.3-test.patch new file mode 100644 index 000000000..1bb3a7a6f --- /dev/null +++ b/SPECS/python-beautifulsoup4/2000-python-beautifulsoup4-4.14.3-test.patch @@ -0,0 +1,25 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: HNO3Miracle +Date: Tue, 14 Apr 2026 16:30:00 +0800 +Subject: [PATCH] python-beautifulsoup4: adjust surrogate-character test for + Python 3.15 + +--- + bs4/tests/__init__.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/bs4/tests/__init__.py b/bs4/tests/__init__.py +index 3650371..2b16f8a 100644 +--- a/bs4/tests/__init__.py ++++ b/bs4/tests/__init__.py +@@ -1125,7 +1125,7 @@ Hello, world! 
+ soup = self.soup(BAD_DOCUMENT) + self.linkage_validator(soup) + +- @pytest.mark.skipif("sys.version_info < (3, 8)") ++ @pytest.mark.skipif("sys.version_info < (3, 15)") + def test_surrogate_in_character_reference(self): + # These character references are invalid and should be replaced with REPLACEMENT CHARACTER. + soup = self.soup("��") +-- +2.51.0 diff --git a/SPECS/python-beautifulsoup4/python-beautifulsoup4.spec b/SPECS/python-beautifulsoup4/python-beautifulsoup4.spec new file mode 100644 index 000000000..222515222 --- /dev/null +++ b/SPECS/python-beautifulsoup4/python-beautifulsoup4.spec @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: (C) 2026 Institute of Software, Chinese Academy of Sciences (ISCAS) +# SPDX-FileCopyrightText: (C) 2026 openRuyi Project Contributors +# SPDX-FileContributor: HNO3Miracle +# +# SPDX-License-Identifier: MulanPSL-2.0 + +%global srcname beautifulsoup4 + +Name: python-beautifulsoup4 +Version: 4.14.3 +Release: %autorelease +Summary: HTML/XML parser for quick-turnaround applications like screen-scraping +License: MIT +URL: http://www.crummy.com/software/BeautifulSoup/ +#!RemoteAsset: sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86 +Source0: https://files.pythonhosted.org/packages/source/b/%{srcname}/%{srcname}-%{version}.tar.gz +BuildArch: noarch +BuildSystem: pyproject + +BuildOption(install): -l bs4 + +# Patches from upstream +# https://git.launchpad.net/beautifulsoup/commit/?id=ec4a722af07341c4aa3fe604b077a1f773c6fdd2 +# Skip the lxml surrogate-character test with older libxml2 releases +Patch0: 0001-Skip-the-lxml-tree-builder-s-test_surrogate_in_chara.patch +# https://git.launchpad.net/beautifulsoup/commit/?id=55f655ffb7ef03bdd1df0f013743831fe54e3c7a +# Fix html.parser numeric character reference handling for newer Python +Patch1: 0002-Change-the-html.parser-tree-builder-s-code-for-handl.patch +# Local openRuyi patch: run the generic surrogate-character test only on Python 3.15 or newer +Patch2000: 
2000-python-beautifulsoup4-4.14.3-test.patch + +BuildRequires: pyproject-rpm-macros +BuildRequires: pkgconfig(python3) +BuildRequires: python3dist(pip) +BuildRequires: python3dist(setuptools) +BuildRequires: python3dist(hatchling) +BuildRequires: python3dist(typing-extensions) +BuildRequires: python3dist(packaging) +BuildRequires: python3dist(soupsieve) +BuildRequires: python3dist(pytest) +BuildRequires: python3dist(html5lib) +BuildRequires: python3dist(lxml) + +Provides: python3-beautifulsoup4 = %{version}-%{release} +%python_provide python3-beautifulsoup4 + +%description +Beautiful Soup is a Python HTML/XML parser designed for quick +turnaround projects like screen-scraping. + +%generate_buildrequires +%pyproject_buildrequires + +%prep +%autosetup -p1 -n %{srcname}-%{version} + +# Remove tox.ini to make sure tox is never used; tests run through pytest directly +rm -f tox.ini + +%check -a +%pytest + +%files -f %{pyproject_files} +%doc NEWS.txt CHANGELOG +%license LICENSE + +%changelog +%autochangelog diff --git a/SPECS/python-soupsieve/python-soupsieve.spec b/SPECS/python-soupsieve/python-soupsieve.spec index 331486301..9e89ab4ad 100644 --- a/SPECS/python-soupsieve/python-soupsieve.spec +++ b/SPECS/python-soupsieve/python-soupsieve.spec @@ -23,6 +23,8 @@ BuildOption(install): -l soupsieve BuildRequires: pyproject-rpm-macros BuildRequires: pkgconfig(python3) +BuildRequires: python3dist(hatchling) >= 0.21.1 +BuildRequires: python3dist(pip) >= 19 BuildRequires: python3dist(setuptools) %if %{with tests} BuildRequires: python3dist(pytest)