From da65b7503b1a6f7f2d14668e665b10b00e03b193 Mon Sep 17 00:00:00 2001
From: Tom Dupisre
Date: Mon, 28 Jul 2025 14:59:37 +0200
Subject: [PATCH] Correction bytes

---
 readability/encoding.py    | 10 +++++-----
 tests/test_article_only.py | 11 +++++++++++
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/readability/encoding.py b/readability/encoding.py
index 08332df..c75cafd 100644
--- a/readability/encoding.py
+++ b/readability/encoding.py
@@ -5,9 +5,9 @@
 import chardet
 
 
-RE_CHARSET = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
-RE_PRAGMA = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
-RE_XML = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
+RE_CHARSET = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+RE_PRAGMA = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
+RE_XML = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
 
 CHARSETS = {
     "big5": "big5hkscs",
@@ -53,11 +53,11 @@ def get_encoding(page):
 
     # Fallback to chardet if declared encodings fail
     # Remove all HTML tags, and leave only text for chardet
-    text = re.sub(r'(\s*</?[^>]*>)+\s*', ' ', page).strip()
     enc = 'utf-8'
+    text = re.sub(r'(\s*</?[^>]*>)+\s*', ' ', page.decode(enc, "replace")).strip()
     if len(text) < 10:
         return enc  # can't guess
-    res = chardet.detect(text)
+    res = chardet.detect(page)
     enc = res["encoding"] or "utf-8"
     # print '->', enc, "%.2f" % res['confidence']
     enc = fix_charset(enc)
diff --git a/tests/test_article_only.py b/tests/test_article_only.py
index fe32212..86797e4 100644
--- a/tests/test_article_only.py
+++ b/tests/test_article_only.py
@@ -94,6 +94,17 @@ def test_best_elem_is_root_and_passing(self):
         doc = Document(sample)
         doc.summary()
 
+    def test_html_input_in_bytes(self):
+        sample = (
+            b'<html class="article" id="body">'
+            b"   <body>"
+            b"       <p>1234567890123456789012345</p>"
+            b"   </body>"
+            b"</html>"
+        )
+        doc = Document(sample)
+        doc.summary()
+
     def test_correct_cleanup(self):
         sample = """