diff --git a/Dockerfile b/Dockerfile index e1e391094f..5fc748862e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:focal as app +FROM ubuntu:jammy as app ARG PYTHON_VERSION=3.12 diff --git a/course_discovery/apps/course_metadata/tests/test_utils.py b/course_discovery/apps/course_metadata/tests/test_utils.py index 7ce9161016..e3748cbad6 100644 --- a/course_discovery/apps/course_metadata/tests/test_utils.py +++ b/course_discovery/apps/course_metadata/tests/test_utils.py @@ -880,10 +880,21 @@ class UtilsTests(TestCase): '

Some text

\n

· Item 1

\n\n

Regular paragraph

\n

· Item 3

' ) ) + @ddt.data( + ( + '

The content of this course also forms part of the six-month onlineExample Link

', # pylint: disable=line-too-long + '

The content of this course also forms part of the six-month online Example Link

' # pylint: disable=line-too-long + ), + ( + '

online course.

Module 1:

', + '

online course. Module 1:

' + ) + ) @ddt.unpack def test_clean_html(self, content, expected): """ Verify the method removes unnecessary HTML attributes. """ - assert clean_html(content) == expected + result = clean_html(content) + assert result == expected, f"\nExpected:\n{expected}\nGot:\n{result}" def test_skill_data_transformation(self): category_data = { diff --git a/course_discovery/apps/course_metadata/utils.py b/course_discovery/apps/course_metadata/utils.py index 2f3fe88e61..e877aeecb8 100644 --- a/course_discovery/apps/course_metadata/utils.py +++ b/course_discovery/apps/course_metadata/utils.py @@ -774,6 +774,8 @@ def clean_html(content): (indicating right-to-left direction), this method will ensure that the 'dir' attribute is preserved or added to maintain consistency with the original content. """ + if not content: + return '' LIST_TAGS = ['ul', 'ol'] is_list_with_dir_attr_present = False @@ -790,12 +792,13 @@ def clean_html(content): cleaned = cleaned.replace('

', '') html_converter = HTML2TextWithLangSpans(bodywidth=None) html_converter.wrap_links = False - cleaned = html_converter.handle(cleaned).strip() - cleaned = markdown.markdown(cleaned) - for tag in LIST_TAGS: - cleaned = cleaned.replace(f'<{tag}>', f'<{tag} dir="rtl">') if is_list_with_dir_attr_present else cleaned - - return cleaned + markdown_text = html_converter.handle(cleaned).strip() + cleaned = markdown.markdown(markdown_text) + cleaned = re.sub(r'([^\s>])\s*(', f'<{tag} dir="rtl">') + return cleaned.strip() def get_file_from_drive_link(image_url):